diff --git a/models/common.py b/models/common.py index 049dfc0b9..63e047788 100644 --- a/models/common.py +++ b/models/common.py @@ -59,9 +59,26 @@ from utils.torch_utils import copy_attr, smart_inference_mode def autopad(k, p=None, d=1): """ - Pads kernel to 'same' output shape, adjusting for optional dilation; returns padding size. - - `k`: kernel, `p`: padding, `d`: dilation. + Pads kernel to achieve 'same' output shape, taking into account optional dilation. + + Args: + k (int | list[int]): Size of the kernel. Supports single integer or list of integers for each dimension. + p (None | int | list[int]): Padding size. If None, computes 'same' padding automatically. Default is None. + d (int): Dilation rate to apply to the kernel. Defaults to 1. + + Returns: + (int | list[int]): Calculated padding size. Returns a single integer if the kernel size is an integer, otherwise a + list of integers matching the dimensions of the kernel. + + Example: + ```python + pad_size = autopad(3) # For a single dimension kernel of size 3, dilation 1 + pad_sizes = autopad([3, 3], d=2) # For a 2D kernel with size 3x3 and dilation 2 + ``` + + Note: + This function is commonly used when creating neural network architectures to ensure the output dimensions + match the input dimensions, facilitating easy model design and debugging. """ if d > 1: k = d * (k - 1) + 1 if isinstance(k, int) else [d * (x - 1) + 1 for x in k] # actual kernel-size @@ -75,26 +92,108 @@ class Conv(nn.Module): default_act = nn.SiLU() # default activation def __init__(self, c1, c2, k=1, s=1, p=None, g=1, d=1, act=True): - """Initializes a standard convolution layer with optional batch normalization and activation.""" + """ + Initialize a convolutional layer with batch normalization and an optional activation function. + + Args: + c1 (int): Number of input channels. + c2 (int): Number of output channels. + k (int): Kernel size. Default is 1. + s (int): Stride size. Default is 1. + p (None | int): Padding size. If None, the padding is computed as 'same' padding. Default is None. + g (int): Number of groups for group convolution. Default is 1. + d (int): Dilation rate. Default is 1. + act (bool or torch.nn.Module): If True, uses the default activation function (SiLU), otherwise no activation + is applied. You can also provide a custom activation function. Default is True. + + Returns: + (None): This is an initialization method, so it does not return anything. + + Example: + ```python + # Creating a convolutional layer with 3 input channels, 16 output channels, kernel size 3, stride 1, and ReLU activation + conv_layer = Conv(3, 16, k=3, s=1, act=torch.nn.ReLU()) + ``` + + Note: + The default activation function used is SiLU if `act` is set to True. You can replace it with other activation + functions by passing the desired nn.Module as the `act` argument. + """ super().__init__() self.conv = nn.Conv2d(c1, c2, k, s, autopad(k, p, d), groups=g, dilation=d, bias=False) self.bn = nn.BatchNorm2d(c2) self.act = self.default_act if act is True else act if isinstance(act, nn.Module) else nn.Identity() def forward(self, x): - """Applies a convolution followed by batch normalization and an activation function to the input tensor `x`.""" + """ + Perform convolution, batch normalization, and activation on the input tensor `x` in sequence. + + Args: + x (torch.Tensor): Input tensor with shape (N, C_in, H, W), where N is the batch size, C_in is the number of input + channels, H is the height, and W is the width. 
+ + Returns: + (torch.Tensor): Output tensor after applying convolution, batch normalization, and activation, with shape + (N, C_out, H_out, W_out) where C_out is the number of output channels and H_out, W_out are the heights and + widths of the output based on the kernel size, stride, and padding. + + Example: + ```python + conv_layer = Conv(3, 16, k=3, s=1, p=1) + input_tensor = torch.randn(1, 3, 224, 224) # Batch size 1, 3 input channels, 224x224 image + output_tensor = conv_layer(input_tensor) + ``` + + Note: + This forward pass integrates three operations: a convolution, batch normalization, and an optional activation function + (default is nn.SiLU). + """ return self.act(self.bn(self.conv(x))) def forward_fuse(self, x): - """Applies a fused convolution and activation function to the input tensor `x`.""" + """ + Apply convolution and activation without batch normalization for optimized inference. + + Args: + x (torch.Tensor): Input tensor with shape (N, C, H, W), typically a feature map from previous layers. + + Returns: + (torch.Tensor): Output tensor after applying convolution and activation, with shape (N, C', H', W') where the + output channels C' may differ from input channels due to the convolution operations. + + Example: + ```python + conv_layer = Conv(3, 16, k=3, s=1, act=True) + fused_output = conv_layer.forward_fuse(torch.rand(1, 3, 224, 224)) + ``` + """ return self.act(self.conv(x)) class DWConv(Conv): # Depth-wise convolution def __init__(self, c1, c2, k=1, s=1, d=1, act=True): - """Initializes a depth-wise convolution layer with optional activation; args: input channels (c1), output - channels (c2), kernel size (k), stride (s), dilation (d), and activation flag (act). + """ + Initializes a depth-wise convolution layer with optional activation. + + Args: + c1 (int): Number of input channels (C1). + c2 (int): Number of output channels (C2). + k (int): Kernel size. Defaults to 1. + s (int): Stride size. Defaults to 1. + d (int): Dilation rate. Defaults to 1. + act (bool | nn.Module): Activation function or flag. If True, SiLU activation is used. If a + nn.Module is provided, it is used as the custom activation function. Defaults to True. + + Returns: + None + + Example: + ```python + dwconv = DWConv(32, 64, 3, 1, 1, True) + input_tensor = torch.rand(1, 32, 224, 224) # Example input tensor with shape (N, C1, H, W) + output_tensor = dwconv(input_tensor) # Output tensor with shape (N, C2, H_out, W_out) + ``` """ super().__init__(c1, c2, k, s, g=math.gcd(c1, c2), d=d, act=act) @@ -102,8 +201,25 @@ class DWConv(Conv): class DWConvTranspose2d(nn.ConvTranspose2d): # Depth-wise transpose convolution def __init__(self, c1, c2, k=1, s=1, p1=0, p2=0): - """Initializes a depth-wise transpose convolutional layer for YOLOv5; args: input channels (c1), output channels - (c2), kernel size (k), stride (s), input padding (p1), output padding (p2). + """ + Initialize a depth-wise transpose convolutional layer. + + Args: + c1 (int): Number of input channels. + c2 (int): Number of output channels. + k (int): Kernel size of the transpose convolution. Default is 1. + s (int): Stride of the transpose convolution. Default is 1. + p1 (int): Input padding for the transpose convolution. Default is 0. + p2 (int): Output padding for the transpose convolution. Default is 0. 
+ + Returns: + None + + Example: + ```python + layer = DWConvTranspose2d(64, 128, 3, 2, 1, 1) + output = layer(torch.randn(1, 64, 32, 32)) # Example input tensor + ``` """ super().__init__(c1, c2, k, s, p1, p2, groups=math.gcd(c1, c2)) @@ -112,9 +228,21 @@ class TransformerLayer(nn.Module): # Transformer layer https://arxiv.org/abs/2010.11929 (LayerNorm layers removed for better performance) def __init__(self, c, num_heads): """ - Initializes a transformer layer, sans LayerNorm for performance, with multihead attention and linear layers. - - See as described in https://arxiv.org/abs/2010.11929. + Initialize a transformer layer without LayerNorm for improved performance. + + Args: + c (int): Number of input and output channels for the transformer layer. + num_heads (int): Number of attention heads in the multihead attention mechanism. + + Returns: + None + + Example: + ```python + layer = TransformerLayer(c=512, num_heads=8) + input_tensor = torch.rand(10, 32, 512) # (sequence_length, batch_size, embedding_dim) + output = layer(input_tensor) + ``` """ super().__init__() self.q = nn.Linear(c, c, bias=False) @@ -125,7 +253,26 @@ class TransformerLayer(nn.Module): self.fc2 = nn.Linear(c, c, bias=False) def forward(self, x): - """Performs forward pass using MultiheadAttention and two linear transformations with residual connections.""" + """ + Perform forward pass with multihead attention and linear layers using residual connections. + + Args: + x (torch.Tensor): Input tensor of shape (T, N, C) where T is the sequence length, N is the batch size, + and C is the embedding dimension. + + Returns: + (torch.Tensor): Output tensor of shape (T, N, C) matching the input shape. + + Example: + ```python + layer = TransformerLayer(c=512, num_heads=8) + input_tensor = torch.rand(10, 32, 512) # Example input tensor with shape (T, N, C) + output = layer(input_tensor) # Output tensor with same shape (T, N, C) + ``` + + Note: + This implementation removes LayerNorm layers for better computational performance. + """ x = self.ma(self.q(x), self.k(x), self.v(x))[0] + x x = self.fc2(self.fc1(x)) + x return x @@ -134,8 +281,27 @@ class TransformerLayer(nn.Module): class TransformerBlock(nn.Module): # Vision Transformer https://arxiv.org/abs/2010.11929 def __init__(self, c1, c2, num_heads, num_layers): - """Initializes a Transformer block for vision tasks, adapting dimensions if necessary and stacking specified - layers. + """ + Initialize a Transformer block for vision tasks, adapting dimensions and stacking specified layers. + + Args: + c1 (int): Number of input channels. + c2 (int): Number of output channels. + num_heads (int): Number of attention heads in each transformer layer. + num_layers (int): Number of transformer layers to stack. + + Returns: + None + + Example: + ```python + transformer_block = TransformerBlock(c1=64, c2=128, num_heads=8, num_layers=6) + ``` + + Note: + This implementation adapts to input dimension changes by including an initial convolution layer if required. + Utilizes multi-head self-attention mechanism as described in the paper: + https://arxiv.org/abs/2010.11929. """ super().__init__() self.conv = None @@ -146,8 +312,27 @@ class TransformerBlock(nn.Module): self.c2 = c2 def forward(self, x): - """Processes input through an optional convolution, followed by Transformer layers and position embeddings for - object detection. + """ + Perform forward pass through the Vision Transformer block. 
+
+        Args:
+            x (torch.Tensor): Input tensor with shape (B, C1, W, H) where B is batch size, C1 is number of input channels,
+                W is width and H is height.
+
+        Returns:
+            (torch.Tensor): Output tensor with shape (B, C2, W, H) after the flattened sequence is processed by the
+                Transformer layers and reshaped back, where B is batch size and C2 is number of output channels.
+
+        Example:
+            ```python
+            transformer_block = TransformerBlock(c1=3, c2=64, num_heads=8, num_layers=6)
+            input_tensor = torch.rand(1, 3, 224, 224)  # Example input tensor of shape (B, C1, W, H)
+            output_tensor = transformer_block(input_tensor)
+            print(output_tensor.shape)  # Will output torch.Size([1, 64, 224, 224])
+            ```
+
+        Note:
+            Ensure the input tensor has the correct shape (B, C1, W, H) and dimensions when using this Transformer block.
         """
         if self.conv is not None:
             x = self.conv(x)
@@ -159,8 +344,34 @@ class TransformerBlock(nn.Module):
 class Bottleneck(nn.Module):
     # Standard bottleneck
     def __init__(self, c1, c2, shortcut=True, g=1, e=0.5):
-        """Initializes a standard bottleneck layer with optional shortcut and group convolution, supporting channel
-        expansion.
+        """
+        Initialize a standard bottleneck layer.
+
+        This layer consists of a sequence of convolution operations optionally followed by a shortcut connection. The
+        bottleneck design reduces the number of parameters while preserving performance by shrinking and then restoring
+        the channel dimensionality.
+
+        Args:
+            c1 (int): Number of input channels.
+            c2 (int): Number of output channels.
+            shortcut (bool): Whether to add a shortcut connection. Defaults to True.
+            g (int): Number of groups for group convolution. Defaults to 1.
+            e (float): Expansion ratio for hidden layer dimensionality. Defaults to 0.5.
+
+        Returns:
+            (None): This function does not return any value.
+
+        Example:
+            ```python
+            bottleneck_layer = Bottleneck(64, 128, shortcut=True, g=1, e=0.5)
+            input_tensor = torch.randn(1, 64, 128, 128)  # Example input tensor with shape (N, C1, H, W)
+            output_tensor = bottleneck_layer(input_tensor)  # Output tensor with shape (N, C2, H, W)
+            ```
+
+        Note:
+            Ensure the input tensor to the Bottleneck layer has the correct shape (N, C1, H, W), where N is the batch size,
+            C1 is the number of input channels, H is the height, and W is the width.
         """
         super().__init__()
         c_ = int(c2 * e)  # hidden channels
@@ -169,8 +380,25 @@ class Bottleneck(nn.Module):
         self.add = shortcut and c1 == c2

     def forward(self, x):
-        """Processes input through two convolutions, optionally adds shortcut if channel dimensions match; input is a
-        tensor.
+        """
+        Perform a forward pass through the bottleneck layer with optional shortcut connection.
+
+        Args:
+            x (torch.Tensor): Input tensor with shape (..., C_in, H, W), where C_in is the number of input channels.
+
+        Returns:
+            (torch.Tensor): Output tensor with shape (..., C_out, H, W), where C_out is the number of output channels; the
+                shortcut connection is included when the input and output channel counts match.
+ + Example: + ```python + import torch + from ultralytics.models.common import Bottleneck + + bottleneck = Bottleneck(c1=64, c2=64) + x = torch.randn(1, 64, 144, 144) # Sample input + y = bottleneck(x) # Forward pass + ``` """ return x + self.cv2(self.cv1(x)) if self.add else self.cv2(self.cv1(x)) @@ -178,8 +406,35 @@ class Bottleneck(nn.Module): class BottleneckCSP(nn.Module): # CSP Bottleneck https://github.com/WongKinYiu/CrossStagePartialNetworks def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5): - """Initializes CSP bottleneck with optional shortcuts; args: ch_in, ch_out, number of repeats, shortcut bool, - groups, expansion. + """ + Initialize the CSP Bottleneck layer, which is an extension of the traditional bottleneck layer to leverage cross-stage + partial connections. + + Args: + c1 (int): Number of input channels. + c2 (int): Number of output channels. + n (int): Number of times the bottleneck layer is repeated. Default is 1. + shortcut (bool): Whether to use shortcut connections in the bottleneck layers. Default is True. + g (int): Number of groups for the grouped convolution in the bottleneck layers. Default is 1. + e (float): Expansion factor to control the hidden channels in the bottleneck layers. Default is 0.5. + + Returns: + (None): Initializes the parameters for the CSP Bottleneck module. + + Example: + ```python + from ultralytics.models.common import BottleneckCSP + + # Instantiate CSPBottleneck with specific configuration + bottleneck_csp = BottleneckCSP(c1=64, c2=128, n=3, shortcut=True, g=2, e=0.5) + + # Example input tensor + input_tensor = torch.randn(1, 64, 128, 128) # Shape (B, C1, H, W) + + # Forward pass through the layer + output_tensor = bottleneck_csp(input_tensor) + print(output_tensor.shape) # Should match expected output shape + ``` """ super().__init__() c_ = int(c2 * e) # hidden channels @@ -192,8 +447,31 @@ class BottleneckCSP(nn.Module): self.m = nn.Sequential(*(Bottleneck(c_, c_, shortcut, g, e=1.0) for _ in range(n))) def forward(self, x): - """Performs forward pass by applying layers, activation, and concatenation on input x, returning feature- - enhanced output. + """ + Perform a forward pass through the CSP (Cross Stage Partial) Bottleneck layer. + + Args: + x (torch.Tensor): Input tensor with shape (N, C, H, W) where N is the batch size, C is the number of channels, + H is the height, and W is the width. + + Returns: + (torch.Tensor): Output tensor after applying CSP bottleneck transformations, with shape (N, C2, H, W), where C2 is + the output channel size specified during initialization. + + Example: + ```python + import torch + from ultralytics.models.common import BottleneckCSP + + model = BottleneckCSP(c1=64, c2=128, n=1) + x = torch.randn(1, 64, 128, 128) + output = model.forward(x) + ``` + + Note: + CSP Bottleneck architecture helps in reducing the amount of computation as well as mitigating the gradient + vanishing problem in deep neural networks. The specific implementation follows the design principles outlined in + https://github.com/WongKinYiu/CrossStagePartialNetworks. """ y1 = self.cv3(self.m(self.cv1(x))) y2 = self.cv2(x) @@ -204,10 +482,30 @@ class CrossConv(nn.Module): # Cross Convolution Downsample def __init__(self, c1, c2, k=3, s=1, g=1, e=1.0, shortcut=False): """ - Initializes CrossConv with downsampling, expanding, and optionally shortcutting; `c1` input, `c2` output - channels. - - Inputs are ch_in, ch_out, kernel, stride, groups, expansion, shortcut. 
+        Initialize the CrossConv module, which combines paired cross convolutions with optional downsampling and shortcut.
+
+        Args:
+            c1 (int): Number of input channels.
+            c2 (int): Number of output channels.
+            k (int): Kernel size for the convolution. Defaults to 3.
+            s (int): Stride for the convolution. Defaults to 1.
+            g (int): Number of groups for the grouped convolution. Defaults to 1.
+            e (float): Expansion factor for the intermediate channels. Defaults to 1.0.
+            shortcut (bool): If True, includes a shortcut connection. Defaults to False.
+
+        Returns:
+            (None): This method initializes the CrossConv instance without returning any value.
+
+        Note:
+            This module is designed for channel expansion and downsampling operations within neural network architectures,
+            particularly for YOLOv5.
+
+        Example:
+            ```python
+            cross_conv = CrossConv(64, 128)
+            input_tensor = torch.randn(1, 64, 224, 224)
+            output = cross_conv(input_tensor)
+            ```
         """
         super().__init__()
         c_ = int(c2 * e)  # hidden channels
@@ -216,15 +514,62 @@ class CrossConv(nn.Module):
         self.add = shortcut and c1 == c2

     def forward(self, x):
-        """Performs feature sampling, expanding, and applies shortcut if channels match; expects `x` input tensor."""
+        """
+        Perform feature downsampling, expansion, and optional shortcut connection in a neural network.
+
+        Args:
+            x (torch.Tensor): Input tensor with shape (N, C, H, W) where N is the batch size, C is the number of channels,
+                H is the height, and W is the width.
+
+        Returns:
+            (torch.Tensor): Output tensor with shape (N, C2, H/s, W/s), transformed through the two cross convolution
+                layers; the spatial dimensions are unchanged when s=1.
+
+        Example:
+            ```python
+            import torch
+            from ultralytics.models.common import CrossConv
+
+            cross_conv = CrossConv(64, 128)
+            input_tensor = torch.randn(1, 64, 224, 224)
+            output_tensor = cross_conv(input_tensor)
+            print(output_tensor.shape)  # torch.Size([1, 128, 224, 224])
+            ```
+
+        Note:
+            CrossConv layers are used in models to effectively downsample and expand feature maps, aiding in feature
+            extraction while maintaining computational efficiency.
+        """
         return x + self.cv2(self.cv1(x)) if self.add else self.cv2(self.cv1(x))


 class C3(nn.Module):
     # CSP Bottleneck with 3 convolutions
     def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):
-        """Initializes C3 module with options for channel count, bottleneck repetition, shortcut usage, group
-        convolutions, and expansion.
+        """
+        Initialize a CSP bottleneck containing three convolutional layers and optional shortcut connections.
+
+        Args:
+            c1 (int): Number of input channels.
+            c2 (int): Number of output channels.
+            n (int): Number of Bottleneck layers to include. Defaults to 1.
+            shortcut (bool): Whether to use shortcut connections in the Bottleneck layers. Defaults to True.
+            g (int): Number of groups for the grouped convolution. Defaults to 1.
+            e (float): Expansion ratio for the hidden channels in the Bottleneck layers. Defaults to 0.5.
+
+        Returns:
+            None: This is an initialization method; the forward pass produces a tensor with C2 channels and the same
+                spatial dimensions as the input.
+ + Example: + ```python + from ultralytics.models.common import C3 + import torch + + c3_layer = C3(c1=128, c2=256, n=1, shortcut=True) + x = torch.randn(1, 128, 32, 32) # Example input tensor + y = c3_layer(x) # Output tensor + print(y.shape) # torch.Size([1, 256, 32, 32]) + ``` """ super().__init__() c_ = int(c2 * e) # hidden channels @@ -234,15 +579,63 @@ class C3(nn.Module): self.m = nn.Sequential(*(Bottleneck(c_, c_, shortcut, g, e=1.0) for _ in range(n))) def forward(self, x): - """Performs forward propagation using concatenated outputs from two convolutions and a Bottleneck sequence.""" + """ + Performs a forward pass using CSP bottleneck with three convolution layers, incorporating hidden bottleneck layers. + + Args: + x (torch.Tensor): Input tensor with shape (N, C, H, W), where N is batch size, C is number of input channels, + H is height, and W is width. + + Returns: + (torch.Tensor): Output tensor with shape (N, C_out, H, W), where C_out is the number of output channels after + processing through the CSP bottleneck with 3 convolutions. + + Example: + ```python + import torch + from ultralytics.models.common import C3 + + model = C3(c1=64, c2=128, n=3) + input_tensor = torch.randn(1, 64, 128, 128) # Example input tensor with shape (N, C, H, W) + output_tensor = model(input_tensor) + print(output_tensor.shape) # Outputs tensor shape after forward pass + ``` + + Note: + CSP Bottleneck with 3 convolutions and hidden bottleneck layers helps in efficient representation by downsampling + and concatenating filtered features from different paths. This architecture is inspired by the principles outlined + in https://github.com/WongKinYiu/CrossStagePartialNetworks. + """ return self.cv3(torch.cat((self.m(self.cv1(x)), self.cv2(x)), 1)) class C3x(C3): # C3 module with cross-convolutions def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5): - """Initializes C3x module with cross-convolutions, extending C3 with customizable channel dimensions, groups, - and expansion. + """ + Initialize the C3x module with cross-convolutions. + + Args: + c1 (int): Number of input channels. + c2 (int): Number of output channels. + n (int): Number of Bottleneck layers to include. Defaults to 1. + shortcut (bool): Whether to add shortcut connections. Defaults to True. + g (int): Number of groups for grouped convolution. Defaults to 1. + e (float): Expansion ratio for hidden channels. Defaults to 0.5. + + Returns: + None: This constructor initializes the C3x module with the specified parameters and does not return any value. + + Note: + This class inherits from C3 and extends its functionality by adding cross-convolutions adjacent to the main + bottleneck layers for enhanced feature extraction. + + Example: + ```python + c3x_layer = C3x(64, 128, n=3, shortcut=True, g=1, e=0.5) + input_tensor = torch.randn(1, 64, 256, 256) # Example input tensor with shape (N, C1, H, W) + output_tensor = c3x_layer(input_tensor) # Output tensor with shape (N, C2, H, W) + ``` """ super().__init__(c1, c2, n, shortcut, g, e) c_ = int(c2 * e) @@ -252,8 +645,30 @@ class C3x(C3): class C3TR(C3): # C3 module with TransformerBlock() def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5): - """Initializes C3 module with TransformerBlock for enhanced feature extraction, accepts channel sizes, shortcut - config, group, and expansion. + """ + Initialize the C3 module with an integrated TransformerBlock for advanced feature extraction. + + Args: + c1 (int): Number of input channels. + c2 (int): Number of output channels. 
+            n (int): Number of Bottleneck layers to be stacked sequentially.
+            shortcut (bool): Whether to use residual connections between layers.
+            g (int): Number of groups in group convolution.
+            e (float): Expansion coefficient for channel dimensions.
+
+        Returns:
+            None
+
+        Example:
+            ```python
+            c3tr = C3TR(64, 128, n=3, shortcut=True, g=1, e=0.5)
+            input_tensor = torch.rand(1, 64, 256, 256)  # Random input tensor with shape (B, C1, H, W)
+            output_tensor = c3tr(input_tensor)
+            ```
+
+        Notes:
+            This module extends C3 by incorporating a TransformerBlock for enhanced contextual feature extraction, following
+            the Vision Transformer design described in "An Image is Worth 16x16 Words" (https://arxiv.org/abs/2010.11929).
         """
         super().__init__(c1, c2, n, shortcut, g, e)
         c_ = int(c2 * e)
@@ -263,8 +678,31 @@ class C3TR(C3):
 class C3SPP(C3):
     # C3 module with SPP()
     def __init__(self, c1, c2, k=(5, 9, 13), n=1, shortcut=True, g=1, e=0.5):
-        """Initializes a C3 module with SPP layer for advanced spatial feature extraction, given channel sizes, kernel
-        sizes, shortcut, group, and expansion ratio.
+        """
+        Initialize a C3 module with Spatial Pyramid Pooling (SPP) for advanced spatial feature extraction.
+
+        Args:
+            c1 (int): Number of input channels.
+            c2 (int): Number of output channels.
+            k (tuple[int]): Kernel sizes for SPP. Defaults to (5, 9, 13).
+            n (int, optional): Number of Bottleneck layers. Defaults to 1.
+            shortcut (bool, optional): Whether to use residual connections. Defaults to True.
+            g (int, optional): Number of groups for group convolution. Defaults to 1.
+            e (float, optional): Expansion ratio for hidden channels. Defaults to 0.5.
+
+        Returns:
+            None
+
+        Example:
+            ```python
+            c3spp = C3SPP(c1=64, c2=128, k=(5, 9, 13), n=1, shortcut=True, g=1, e=0.5)
+            input_tensor = torch.randn(1, 64, 32, 32)  # Batch size 1, 64 channels, 32x32 image
+            output = c3spp(input_tensor)
+            print(output.shape)  # Expected output shape: (1, 128, 32, 32)
+            ```
+
+        Note:
+            The SPP layer enhances the receptive field size while keeping computational costs manageable.
         """
         super().__init__(c1, c2, n, shortcut, g, e)
         c_ = int(c2 * e)
@@ -274,7 +712,29 @@ class C3SPP(C3):
 class C3Ghost(C3):
     # C3 module with GhostBottleneck()
     def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):
-        """Initializes YOLOv5's C3 module with Ghost Bottlenecks for efficient feature extraction."""
+        """
+        Initializes YOLOv5's C3 module using Ghost Bottlenecks for efficient feature extraction and processing.
+
+        Args:
+            c1 (int): Number of input channels.
+            c2 (int): Number of output channels.
+            n (int, optional): Number of Bottleneck layers to include. Defaults to 1.
+            shortcut (bool, optional): Whether to add shortcut connections. Defaults to True.
+            g (int, optional): Number of groups for group convolution. Defaults to 1.
+            e (float, optional): Expansion ratio for hidden channels. Defaults to 0.5.
+ + Returns: + None + + Example: + ```python + from ultralytics.models.common import C3Ghost + + c3ghost_layer = C3Ghost(c1=64, c2=128, n=2, shortcut=True, e=0.5) + input_tensor = torch.randn(1, 64, 256, 256) # Random input tensor with shape (B, C1, H, W) + output_tensor = c3ghost_layer(input_tensor) + ``` + """ super().__init__(c1, c2, n, shortcut, g, e) c_ = int(c2 * e) # hidden channels self.m = nn.Sequential(*(GhostBottleneck(c_, c_) for _ in range(n))) @@ -283,7 +743,30 @@ class C3Ghost(C3): class SPP(nn.Module): # Spatial Pyramid Pooling (SPP) layer https://arxiv.org/abs/1406.4729 def __init__(self, c1, c2, k=(5, 9, 13)): - """Initializes SPP layer with Spatial Pyramid Pooling, ref: https://arxiv.org/abs/1406.4729, args: c1 (input channels), c2 (output channels), k (kernel sizes).""" + """ + Initialize the Spatial Pyramid Pooling (SPP) layer to enhance receptive field size and feature extraction. + + Args: + c1 (int): Number of input channels. + c2 (int): Number of output channels. + k (tuple[int], optional): Kernel sizes for max pooling layers. Default is (5, 9, 13). + + Returns: + None + + Example: + ```python + spp_layer = SPP(c1=64, c2=128, k=(5, 9, 13)) + input_tensor = torch.randn(1, 64, 32, 32) # Batch size 1, 64 channels, 32x32 resolution + output_tensor = spp_layer(input_tensor) + print(output_tensor.shape) # Output shape: (1, 128, 32, 32) + ``` + + Note: + The SPP layer facilitates effective extraction of multi-scale context by performing max pooling + with multiple kernel sizes. This enhances the network's receptive field and robustness to object + scaling and deformation. + """ super().__init__() c_ = c1 // 2 # hidden channels self.cv1 = Conv(c1, c_, 1, 1) @@ -291,8 +774,23 @@ class SPP(nn.Module): self.m = nn.ModuleList([nn.MaxPool2d(kernel_size=x, stride=1, padding=x // 2) for x in k]) def forward(self, x): - """Applies convolution and max pooling layers to the input tensor `x`, concatenates results, and returns output - tensor. + """ + Apply the Spatial Pyramid Pooling (SPP) process to enhance spatial feature extraction from input tensor. + + Args: + x (torch.Tensor): Input tensor with shape (N, C, H, W), where N is batch size, C is the number of channels, + H is the height, and W is the width. + + Returns: + (torch.Tensor): Output tensor with enhanced spatial features, having shape (N, C2, H, W). + + Example: + ```python + spp_layer = SPP(c1=64, c2=128, k=(5, 9, 13)) + input_tensor = torch.randn(1, 64, 32, 32) # Batch size 1, 64 channels, 32x32 spatial dimensions + output_tensor = spp_layer(input_tensor) + print(output_tensor.shape) # Expected output shape: (1, 128, 32, 32) + ``` """ x = self.cv1(x) with warnings.catch_warnings(): @@ -304,10 +802,27 @@ class SPPF(nn.Module): # Spatial Pyramid Pooling - Fast (SPPF) layer for YOLOv5 by Glenn Jocher def __init__(self, c1, c2, k=5): """ - Initializes YOLOv5 SPPF layer with given channels and kernel size for YOLOv5 model, combining convolution and - max pooling. - - Equivalent to SPP(k=(5, 9, 13)). + Initialize YOLOv5's Spatial Pyramid Pooling - Fast (SPPF) layer with convolution and max pooling. + + Args: + c1 (int): Number of input channels. + c2 (int): Number of output channels. + k (int, optional): Kernel size for max pooling layers. Default is 5. + + Returns: + None: This method initializes the SPPF layer without returning any value. 
+ + Example: + ```python + sppf = SPPF(128, 256, k=5) + input_tensor = torch.randn(1, 128, 64, 64) + output_tensor = sppf(input_tensor) + print(output_tensor.shape) # Expected output shape: (1, 256, 64, 64) + ``` + + Note: + SPPF enhances feature extraction efficiency by reducing spatial dimensions and enriching features using convolution and + max pooling. """ super().__init__() c_ = c1 // 2 # hidden channels @@ -316,7 +831,29 @@ class SPPF(nn.Module): self.m = nn.MaxPool2d(kernel_size=k, stride=1, padding=k // 2) def forward(self, x): - """Processes input through a series of convolutions and max pooling operations for feature extraction.""" + """ + Perform forward pass through the Spatial Pyramid Pooling-Fast (SPPF) layer to enhance spatial features. + + Args: + x (torch.Tensor): Input tensor with shape (N, C1, H, W), where N is batch size, C1 is number of input channels, + H is height, and W is width. + + Returns: + (torch.Tensor): Output tensor with enriched spatial features and shape (N, C2, H, W), where C2 is the number of + output channels. + + Example: + ```python + sppf = SPPF(128, 256, k=5) + input_tensor = torch.randn(1, 128, 64, 64) # Example input tensor + output_tensor = sppf(input_tensor) + print(output_tensor.shape) # Expected output shape: (1, 256, 64, 64) + ``` + + Note: + The SPPF layer leverages multiple levels of max pooling to capture diverse spatial patterns efficiently, which is + particularly useful in object detection tasks like YOLOv5. + """ x = self.cv1(x) with warnings.catch_warnings(): warnings.simplefilter("ignore") # suppress torch 1.9.0 max_pool2d() warning @@ -328,15 +865,61 @@ class SPPF(nn.Module): class Focus(nn.Module): # Focus wh information into c-space def __init__(self, c1, c2, k=1, s=1, p=None, g=1, act=True): - """Initializes Focus module to concentrate width-height info into channel space with configurable convolution - parameters. + """ + Initialize the Focus layer that concatenates slices of the input tensor to increase channel depth before applying convolution. + + Args: + c1 (int): Number of input channels. + c2 (int): Number of output channels. + k (int): Kernel size for the convolution. Defaults to 1. + s (int): Stride for the convolution. Defaults to 1. + p (int | None): Padding size for the convolution. Uses automatic padding if None. Defaults to None. + g (int): Group size for the convolution. Defaults to 1. + act (bool | nn.Module): Activation function to apply after the convolution. Uses default activation (nn.SiLU) if True, + or no activation if False. Can also be a custom activation module. + + Returns: + None: This is an initializer method, so it does not return a value. + + Example: + ```python + focus = Focus(3, 64, k=3, s=1, p=1) + input_tensor = torch.rand(1, 3, 224, 224) + output = focus(input_tensor) + ``` + Notes: + The Focus layer is designed to increase the channel dimension by concatenating four slices of the input tensor, + then applying a convolution to the concatenated result. """ super().__init__() self.conv = Conv(c1 * 4, c2, k, s, p, g, act=act) # self.contract = Contract(gain=2) def forward(self, x): - """Processes input through Focus mechanism, reshaping (b,c,w,h) to (b,4c,w/2,h/2) then applies convolution.""" + """ + Focus width and height information into channel space and apply convolution. + + Args: + x (torch.Tensor): Input tensor with shape (N, C1, H, W) where N is the batch size, C1 is the number of input + channels, H is the height, and W is the width. 
+ + Returns: + (torch.Tensor): Output tensor with shape (N, C2, H/2, W/2) where C2 is the number of output channels specified + during initialization. + + Example: + ```python + focus_layer = Focus(3, 64, k=3, s=1, p=1) + input_tensor = torch.rand(1, 3, 224, 224) + output_tensor = focus_layer(input_tensor) + print(output_tensor.shape) # Expected shape: (1, 64, 112, 112) + ``` + + Notes: + The Focus layer increases the channel dimension by concatenating four slices of the input tensor, each slice being + a downsampled version of the input. This effectively focuses width and height information into the channel space before + applying convolution. + """ return self.conv(torch.cat((x[..., ::2, ::2], x[..., 1::2, ::2], x[..., ::2, 1::2], x[..., 1::2, 1::2]), 1)) # return self.conv(self.contract(x)) @@ -344,8 +927,36 @@ class Focus(nn.Module): class GhostConv(nn.Module): # Ghost Convolution https://github.com/huawei-noah/ghostnet def __init__(self, c1, c2, k=1, s=1, g=1, act=True): - """Initializes GhostConv with in/out channels, kernel size, stride, groups, and activation; halves out channels - for efficiency. + """ + Initialize a Ghost Convolution layer for efficient feature extraction using fewer parameters. + + Args: + c1 (int): Number of input channels. + c2 (int): Number of output channels. + k (int, optional): Kernel size for convolution. Default is 1. + s (int, optional): Stride for convolution. Default is 1. + g (int, optional): Number of groups for convolution, facilitating group-wise operations. Default is 1. + act (bool | nn.Module, optional): Activation function to use. Default is True, which uses the default activation; + can also accept an nn.Module for custom activation or False for no activation. + + Returns: + (None): This method initializes the Ghost Convolution layer without returning any value. + + Example: + ```python + import torch + from ultralytics.models.common import GhostConv + + x = torch.randn(1, 64, 128, 128) # Example input tensor with shape (B, C1, H, W) + conv_layer = GhostConv(64, 128) # Initialize GhostConv with 64 input channels and 128 output channels + y = conv_layer(x) # Forward pass + print(y.shape) # Should output: torch.Size([1, 128, 128, 128]) + ``` + + Note: + The Ghost Convolution technique effectively reduces computational complexity by splitting convolution into two steps: + a primary convolution and a series of cheaper operations to generate 'ghost' feature maps. The technique is published + by Huawei Noah's Ark Lab and is aimed at optimizing neural network performance on edge devices. """ super().__init__() c_ = c2 // 2 # hidden channels @@ -353,7 +964,32 @@ class GhostConv(nn.Module): self.cv2 = Conv(c_, c_, 5, 1, None, c_, act=act) def forward(self, x): - """Performs forward pass, concatenating outputs of two convolutions on input `x`: shape (B,C,H,W).""" + """ + Perform a forward pass through the Ghost Convolution layer. + + Args: + x (torch.Tensor): Input tensor with shape (N, C1, H, W), where N is the batch size, C1 is the number of input + channels, H is the height, and W is the width. + + Returns: + (torch.Tensor): Output tensor with shape (N, C2, H, W), where C2 is the number of output channels after + applying Ghost Convolution operations. 
+ + Example: + ```python + import torch + from ultralytics.models.common import GhostConv + + input_tensor = torch.randn(1, 64, 128, 128) # Example input tensor with shape (B, C1, H, W) + ghost_conv_layer = GhostConv(64, 128) # Initialize GhostConv with 64 input channels and 128 output channels + output_tensor = ghost_conv_layer.forward(input_tensor) # Forward pass + print(output_tensor.shape) # Should output: torch.Size([1, 128, 128, 128]) + ``` + + Note: + Ghost Convolution aims to optimize feature extraction by combining standard convolutions with cheaper operations to + generate 'ghost' feature maps, enhancing computational efficiency and performance. + """ y = self.cv1(x) return torch.cat((y, self.cv2(y)), 1) @@ -361,7 +997,36 @@ class GhostConv(nn.Module): class GhostBottleneck(nn.Module): # Ghost Bottleneck https://github.com/huawei-noah/ghostnet def __init__(self, c1, c2, k=3, s=1): - """Initializes GhostBottleneck with ch_in `c1`, ch_out `c2`, kernel size `k`, stride `s`; see https://github.com/huawei-noah/ghostnet.""" + """ + Initialize a GhostBottleneck layer for efficient feature extraction and processing with optional downsampling. + + Args: + c1 (int): Number of input channels. + c2 (int): Number of output channels. + k (int): Kernel size for depth-wise convolution. Defaults to 3. + s (int): Stride for depth-wise convolution, determines downsampling. Defaults to 1. + + Returns: + None + + Example: + ```python + from ultralytics.models.common import GhostBottleneck + import torch + + # Initialize GhostBottleneck with 64 input channels, 128 output channels + ghost_bottleneck = GhostBottleneck(c1=64, c2=128, k=3, s=2) + x = torch.randn(1, 64, 56, 56) # Example input tensor + output = ghost_bottleneck(x) + print(output.shape) # Expected output tensor shape: (1, 128, 28, 28) + ``` + + Note: + The GhostBottleneck module incorporates GhostConvs and optional depth-wise convolutions for efficient feature + processing. The use of GhostConv layers reduces computational overhead while maintaining performance, making + this bottleneck design suitable for deploying neural networks on resource-constrained devices. The specific + implementation is inspired by the GhostNet architecture: https://github.com/huawei-noah/ghostnet. + """ super().__init__() c_ = c2 // 2 self.conv = nn.Sequential( @@ -374,22 +1039,76 @@ class GhostBottleneck(nn.Module): ) def forward(self, x): - """Processes input through conv and shortcut layers, returning their summed output.""" + """ + Performs a forward pass through the GhostBottleneck layer, leveraging Ghost convolution operations for efficient feature extraction. + + Args: + x (torch.Tensor): Input tensor with shape (N, C, H, W), where N is the batch size, C is the number of channels, + H is the height, and W is the width. + + Returns: + (torch.Tensor): Output tensor with shape (N, C2, H, W) after applying Ghost Convolutions and optional + shortcut connections, where C2 is the number of output channels specified during initialization. + + Example: + ```python + import torch + from ultralytics.models.common import GhostBottleneck + + # Initialize GhostBottleneck with 64 input and 128 output channels + ghost_bottleneck = GhostBottleneck(64, 128) + x = torch.randn(1, 64, 56, 56) # Example input + y = ghost_bottleneck(x) # Forward pass + print(y.shape) # Output shape should be (1, 128, 56, 56) + ``` + + Note: + This layer is part of the GhostNet architecture, designed for lightweight and efficient neural network models, + particularly on edge devices. 
The architecture minimizes computational complexity by generating fewer primary + feature maps and using cheap operations to produce 'ghost' feature maps. + """ return self.conv(x) + self.shortcut(x) class Contract(nn.Module): # Contract width-height into channels, i.e. x(1,64,80,80) to x(1,256,40,40) def __init__(self, gain=2): - """Initializes a layer to contract spatial dimensions (width-height) into channels, e.g., input shape - (1,64,80,80) to (1,256,40,40). + """ + Initialize the Contract module for transforming spatial dimensions into channels. + + Args: + gain (int): The factor by which to contract the dimensions. For example, a gain of 2 will halve the spatial + dimensions and quadruple the channel dimension. + + Example: + ```python + contract_layer = Contract(gain=2) + x = torch.randn(1, 64, 80, 80) + output = contract_layer(x) # results in shape (1, 256, 40, 40) + ``` """ super().__init__() self.gain = gain def forward(self, x): - """Processes input tensor to expand channel dimensions by contracting spatial dimensions, yielding output shape - `(b, c*s*s, h//s, w//s)`. + """ + Forward pass for contracting the spatial dimensions into the channel dimension. + + Args: + x (torch.Tensor): Input tensor of shape (B, C, H, W) where B is the batch size, C is the number of channels, + H is the height, and W is the width. + + Returns: + (torch.Tensor): Tensor with the spatial dimensions contracted into the channel dimension, with shape + (B, C * gain * gain, H // gain, W // gain). + + Example: + ```python + contract_layer = Contract(gain=2) + input_tensor = torch.randn(1, 64, 80, 80) + output_tensor = contract_layer(input_tensor) + assert output_tensor.shape == (1, 256, 40, 40) + ``` """ b, c, h, w = x.size() # assert (h / s == 0) and (W / s == 0), 'Indivisible gain' s = self.gain @@ -402,17 +1121,46 @@ class Expand(nn.Module): # Expand channels into width-height, i.e. x(1,64,80,80) to x(1,16,160,160) def __init__(self, gain=2): """ - Initializes the Expand module to increase spatial dimensions by redistributing channels, with an optional gain - factor. - - Example: x(1,64,80,80) to x(1,16,160,160). + Initialize the Expand module to increase spatial dimensions by redistributing channels. + + Args: + gain (int): Factor to redistribute channels into spatial dimensions. Default is 2. + + Returns: + None + + Example: + ```python + expand_layer = Expand(gain=2) + input_tensor = torch.randn(1, 64, 80, 80) + output_tensor = expand_layer(input_tensor) # Output shape will be (1, 16, 160, 160) + ``` """ super().__init__() self.gain = gain def forward(self, x): - """Processes input tensor x to expand spatial dimensions by redistributing channels, requiring C / gain^2 == - 0. + """ + Expand channels into spatial dimensions, i.e., transforms tensor shape (B, C, H, W) to (B, C/(gain^2), H*gain, W*gain). + + Args: + x (torch.Tensor): Input tensor with shape (B, C, H, W), where B is the batch size, C is the number of channels, + H is the height, and W is the width. + + Returns: + (torch.Tensor): Output tensor with expanded spatial dimensions and reshaped channels. For example, an input tensor + with shape (B, C, H, W) is transformed into (B, C/(gain^2), H*gain, W*gain), where gain is the expansion factor. 
+ + Example: + ```python + expand_layer = Expand(gain=2) + input_tensor = torch.rand(1, 64, 80, 80) + output_tensor = expand_layer(input_tensor) + print(output_tensor.shape) # Expected output: torch.Size([1, 16, 160, 160]) + ``` + + Note: + Ensure that the number of input channels `C` is divisible by `gain^2` to avoid reshaping errors. """ b, c, h, w = x.size() # assert C / s ** 2 == 0, 'Indivisible gain' s = self.gain @@ -424,13 +1172,52 @@ class Expand(nn.Module): class Concat(nn.Module): # Concatenate a list of tensors along dimension def __init__(self, dimension=1): - """Initializes a Concat module to concatenate tensors along a specified dimension.""" + """ + Initializes a Concat module to concatenate tensors along a specified dimension. + + Args: + dimension (int): Dimension along which to concatenate the input tensors. Default is 1. + + Returns: + None: This method initializes the Concat module without returning any value. + + Example: + ```python + concat_layer = Concat(dimension=1) + input_tensor1 = torch.randn(2, 3, 64, 64) + input_tensor2 = torch.randn(2, 3, 64, 64) + output_tensor = concat_layer([input_tensor1, input_tensor2]) + print(output_tensor.shape) # Expected output shape: (2, 6, 64, 64) + ``` + """ super().__init__() self.d = dimension def forward(self, x): - """Concatenates a list of tensors along a specified dimension; `x` is a list of tensors, `dimension` is an - int. + """ + Concatenate a list of tensors along a specified dimension. + + Args: + x (list[torch.Tensor]): A list of tensors to concatenate along the specified dimension. Each tensor + must have the same shape except along the concatenation dimension. + + Returns: + (torch.Tensor): The concatenated tensor along the specified dimension. + + Example: + ```python + import torch + + t1 = torch.randn(2, 3) + t2 = torch.randn(2, 3) + concat_module = Concat(dimension=0) + result = concat_module([t1, t2]) + print(result.shape) # Output shape will be (4, 3) + ``` + + Note: + The concatenation dimension is specified during the initialization of the Concat module. Ensure that + all tensors to be concatenated have matching shapes except along the dimension specified. """ return torch.cat(x, self.d) @@ -438,7 +1225,35 @@ class Concat(nn.Module): class DetectMultiBackend(nn.Module): # YOLOv5 MultiBackend class for python inference on various backends def __init__(self, weights="yolov5s.pt", device=torch.device("cpu"), dnn=False, data=None, fp16=False, fuse=True): - """Initializes DetectMultiBackend with support for various inference backends, including PyTorch and ONNX.""" + """ + Initialize the DetectMultiBackend class for inference on multiple backends such as PyTorch, ONNX, TensorRT, and more. + + Args: + weights (str | list[str]): Path to the model weights. Multiple weights can be specified as a list for ensemble. + Supported extensions include .pt, .onnx, .torchscript, .xml, .engine, .mlmodel, .pb, .tflite, and more. + device (torch.device): The device to run the model on, e.g., torch.device('cpu'), torch.device('cuda:0'). + Default is torch.device('cpu'). + dnn (bool): Flag to use OpenCV DNN for ONNX models. Default is False. + data (str | None): Path to the dataset configuration file containing class names. If None, default names will be used. + Default is None. + fp16 (bool): Flag to enable half-precision FP16 inference. Default is False. + fuse (bool): Flag to fuse model convolutions for improved runtime efficiency. Default is True. 
+ + Returns: + None + + Example: + ```python + from ultralytics import DetectMultiBackend + + model = DetectMultiBackend(weights='yolov5s.pt', device=torch.device('cuda:0')) + ``` + + Note: + - Successfully supports multiple backends such as PyTorch, ONNX, TensorRT, OpenCV DNN, PaddlePaddle, TensorFlow, and more. + - Ensure that appropriate dependency packages for various backends are installed as required. + - Utilizes efficient pre-initializations and backend-specific optimizations defined within the `__init__` method to support diverse methods of model loading and inference. + """ # PyTorch: weights = *.pt # TorchScript: *.torchscript # ONNX Runtime: *.onnx @@ -655,7 +1470,39 @@ class DetectMultiBackend(nn.Module): self.__dict__.update(locals()) # assign all variables to self def forward(self, im, augment=False, visualize=False): - """Performs YOLOv5 inference on input images with options for augmentation and visualization.""" + """ + Performs inference on input images with support for multiple backends (PyTorch, ONNX, TensorRT, etc.). + + Args: + im (torch.Tensor): Input tensor containing images, with shape (B, C, H, W) where B is batch size, C is number of + channels, H is height, and W is width. + augment (bool): Boolean flag to perform data augmentation during inference. Defaults to False. + visualize (bool): Boolean flag to store or visualize the features/activations. Defaults to False. + + Returns: + (torch.Tensor): Inference output tensor. Depending on the backend, this can be a single torch.Tensor or a list of + torch.Tensors. Each tensor contains detection results such as bounding boxes and class scores. + + Example: + ```python + import torch + from ultralytics.models.common import DetectMultiBackend + + # Initialize the model for a specific backend + model = DetectMultiBackend(weights='yolov5s.pt', device=torch.device('cpu')) + + # Example input tensor of shape (B, C, H, W) + input_tensor = torch.randn(1, 3, 640, 640) + + # Perform inference + output_tensor = model.forward(input_tensor) + ``` + + Note: + This function handles input preprocessing, model inference, and postprocessing. It supports multiple deep learning + backends such as PyTorch, ONNX, TensorRT, TensorFlow, and more, with device compatibility checks and backend-specific + operations. + """ b, ch, h, w = im.shape # batch, channel, height, width if self.fp16 and im.dtype != torch.float16: im = im.half() # to FP16 @@ -737,11 +1584,50 @@ class DetectMultiBackend(nn.Module): return self.from_numpy(y) def from_numpy(self, x): - """Converts a NumPy array to a torch tensor, maintaining device compatibility.""" + """ + Convert NumPy array `x` to a torch tensor, maintaining device compatibility. + + Args: + x (numpy.ndarray): Input array to convert to torch tensor, with any shape. + + Returns: + (torch.Tensor): Converted torch tensor with the same data and shape as input array. + + Example: + ```python + import numpy as np + input_array = np.random.randn(3, 224, 224) # Example input array + tensor = detect_multi_backend_instance.from_numpy(input_array) + print(tensor.shape) # Should output: torch.Size([3, 224, 224]) + ``` + + Note: + This function ensures that the resulting torch tensor retains the appropriate device (CPU or GPU) setting. 
+ """ return torch.from_numpy(x).to(self.device) if isinstance(x, np.ndarray) else x def warmup(self, imgsz=(1, 3, 640, 640)): - """Performs a single inference warmup to initialize model weights, accepting an `imgsz` tuple for image size.""" + """ + Warms up the model by performing initial inference to prepare weights and memory allocations. + + Args: + imgsz (tuple[int]): Input image size tuple in the format (B, C, H, W), where B is batch size, C is number of + channels, H is height, and W is width for the warmup run. Defaults to (1, 3, 640, 640). + + Returns: + None + + Example: + ```python + detect_backend = DetectMultiBackend(weights='yolov5s.pt') + detect_backend.warmup(imgsz=(1, 3, 320, 320)) + ``` + + Note: + The warmup process involves passing a blank tensor through the model to ensure that weights are moved to the + selected device, and memory is allocated properly. This is particularly useful for models running on GPU or with + FP16 precision. + """ warmup_types = self.pt, self.jit, self.onnx, self.engine, self.saved_model, self.pb, self.triton if any(warmup_types) and (self.device.type != "cpu" or self.triton): im = torch.empty(*imgsz, dtype=torch.half if self.fp16 else torch.float, device=self.device) # input @@ -751,9 +1637,37 @@ class DetectMultiBackend(nn.Module): @staticmethod def _model_type(p="path/to/model.pt"): """ - Determines model type from file path or URL, supporting various export formats. - - Example: path='path/to/model.onnx' -> type=onnx + Determine the model type from a given file path or URL. + + Args: + p (str): File path or URL for the model. + Supported formats include PyTorch, TorchScript, ONNX, OpenVINO, TensorRT, CoreML, TensorFlow, TFLite, and PaddlePaddle. + + Returns: + (tuple[bool, bool, bool, bool, bool, bool, bool, bool, bool, bool, bool, bool]): + A tuple of booleans representing the type of model inferred from the file path or URL. Each boolean indicates: + - PyTorch (.pt) + - TorchScript + - ONNX (.onnx) + - OpenVINO (.xml) + - TensorRT (.engine) + - CoreML (.mlmodel) + - TensorFlow SavedModel + - TensorFlow GraphDef (.pb) + - TensorFlow Lite (.tflite) + - TensorFlow Edge TPU (.tflite) + - TensorFlow.js + - PaddlePaddle + + Example: + ```python + model_type = DetectMultiBackend._model_type("model.onnx") + assert model_type == (False, False, True, False, False, False, False, False, False, False, False, False) + ``` + + Note: + This method relies on the file suffix and URL scheme to determine the type of model. Use this method to + programmatically infer the model type, facilitating subsequent backend-specific operations. """ # types = [pt, jit, onnx, xml, engine, coreml, saved_model, pb, tflite, edgetpu, tfjs, paddle] from export import export_formats @@ -770,7 +1684,29 @@ class DetectMultiBackend(nn.Module): @staticmethod def _load_metadata(f=Path("path/to/meta.yaml")): - """Loads metadata from a YAML file, returning strides and names if the file exists, otherwise `None`.""" + """ + Load metadata from a specified YAML file. + + Args: + f (Path): The path to the YAML file containing metadata. + + Returns: + (int, dict): A tuple containing the following: + - stride (int): The stride value extracted from the YAML file. + - names (dict): A dictionary of class names mapped by their index. 
+ + Example: + ```python + from pathlib import Path + metadata_path = Path("path/to/meta.yaml") + stride, names = DetectMultiBackend._load_metadata(metadata_path) + print(f"Stride: {stride}") + print(f"Class Names: {names}") + ``` + + Note: + Ensure the YAML file at the specified path exists and contains 'stride' and 'names' keys for successful metadata extraction. + """ if f.exists(): d = yaml_load(f) return d["stride"], d["names"] # assign stride, names @@ -788,7 +1724,27 @@ class AutoShape(nn.Module): amp = False # Automatic Mixed Precision (AMP) inference def __init__(self, model, verbose=True): - """Initializes YOLOv5 model for inference, setting up attributes and preparing model for evaluation.""" + """ + Initializes an input-robust YOLO model with preprocessing, inference, and post-processing capabilities. + + Args: + model (torch.nn.Module): The YOLO model to be wrapped. + verbose (bool): If True, logs information about the initialization. Defaults to True. + + Returns: + None + + Example: + ```python + from ultralytics import YOLO + from ultralytics.models.common import AutoShape + + model = YOLO("yolov5s.pt") + auto_shape_model = AutoShape(model) + input_image = "path/to/image.jpg" + predictions = auto_shape_model(input_image) + ``` + """ super().__init__() if verbose: LOGGER.info("Adding AutoShape... ") @@ -803,9 +1759,25 @@ class AutoShape(nn.Module): def _apply(self, fn): """ - Applies to(), cpu(), cuda(), half() etc. - - to model tensors excluding parameters or registered buffers. + Apply a function to model tensors excluding parameters or registered buffers. + + Args: + fn (Callable): The function to apply to the tensors. Common choices include `to()`, `cpu()`, `cuda()`, `half()`, etc. + + Returns: + (AutoShape): The current instance with the function applied. + + Note: + This method is useful for moving all tensors to a specific device (e.g., GPU) or changing their data types. + ```python + self = super()._apply(fn) + if self.pt: + m = self.model.model.model[-1] if self.dmb else self.model.model[-1] # Detect() + m.stride = fn(m.stride) + m.grid = list(map(fn, m.grid)) + if isinstance(m.anchor_grid, list): + m.anchor_grid = list(map(fn, m.anchor_grid)) + ``` """ self = super()._apply(fn) if self.pt: @@ -819,9 +1791,41 @@ class AutoShape(nn.Module): @smart_inference_mode() def forward(self, ims, size=640, augment=False, profile=False): """ - Performs inference on inputs with optional augment & profiling. - - Supports various formats including file, URI, OpenCV, PIL, numpy, torch. + Perform inference on given image inputs with support for various input formats. + + Args: + ims (str | list[str] | pathlib.Path | list[pathlib.Path] | np.ndarray | list[np.ndarray] | torch.Tensor | + list[torch.Tensor] | PIL.Image.Image | list[PIL.Image.Image]): + Input images. Supported formats: + - File path as string ('data/images/zidane.jpg') or Path object. + - URL as string ('https://ultralytics.com/images/zidane.jpg'). + - OpenCV image (cv2.imread()) with shape (H, W, 3). + - PIL image (Image.open()) with shape (H, W, 3). + - NumPy array with shape (H, W, 3) or (B, C, H, W). + - Torch tensor with shape (B, C, H, W). + - List of any of the above. + size (int | tuple[int, int], optional): Target size for resizing input images, specified as an integer or + a tuple (height, width). Defaults to 640. + augment (bool, optional): If True, apply image augmentations during inference. Defaults to False. + profile (bool, optional): If True, profile the inference process. Defaults to False. 
+ + Returns: + (list[torch.Tensor]): List of detection results, where each tensor has shape (N, 6) representing + (x1, y1, x2, y2, conf, cls) for each detection. + + Example: + ```python + from PIL import Image + from ultralytics import YOLO + + model = YOLO("yolov5s.pt") + img = Image.open("path/to/image.jpg") + results = model.autoshape.forward(img) + ``` + + Note: + Inference can be performed with Automatic Mixed Precision (AMP) if `amp` attribute is set to `True` and + the current hardware supports it. """ # For size(height=640, width=1280), RGB images example inputs are: # file: ims = 'data/images/zidane.jpg' # str or PosixPath @@ -891,7 +1895,36 @@ class AutoShape(nn.Module): class Detections: # YOLOv5 detections class for inference results def __init__(self, ims, pred, files, times=(0, 0, 0), names=None, shape=None): - """Initializes the YOLOv5 Detections class with image info, predictions, filenames, timing and normalization.""" + """ + Initialize the Detections object, which stores prediction results from the YOLO model. + + Args: + ims (list[np.ndarray]): A list of images as numpy arrays, where each array represents an image in HWC format. + pred (list[torch.Tensor]): List of tensors containing the predicted bounding boxes and scores for each image. + Each tensor has shape (N, 6) for (x1, y1, x2, y2, conf, cls). + files (list[str]): List of filenames corresponding to images. + times (tuple[float, float, float], optional): Profiling times, default is (0, 0, 0). + names (list[str], optional): List of class names used for predictions, default is None. + shape (tuple[int, int], optional): Shape of the input image, given as (height, width), default is None. + + Returns: + None + + Example: + ```python + ims = [cv2.imread("image1.jpg"), cv2.imread("image2.jpg")] + pred = [torch.tensor([[50, 50, 200, 200, 0.9, 1]]), torch.tensor([[30, 30, 150, 150, 0.8, 0]])] + names = ["class0", "class1"] + files = ["image1.jpg", "image2.jpg"] + times = (0.1, 0.2, 0.3) + detections = Detections(ims, pred, files, times, names) + ``` + + Note: + This class simplifies tracing of predictions through different stages of the YOLOv5 inference pipeline, including + preprocessing, model inference, and postprocessing. The Detections class maintains a convenient interface to + access both raw and normalized bounding box coordinates and associated metadata. + """ super().__init__() d = pred[0].device # device gn = [torch.tensor([*(im.shape[i] for i in [1, 0, 1, 0]), 1, 1], device=d) for im in ims] # normalizations @@ -909,7 +1942,34 @@ class Detections: self.s = tuple(shape) # inference BCHW shape def _run(self, pprint=False, show=False, save=False, crop=False, render=False, labels=True, save_dir=Path("")): - """Executes model predictions, displaying and/or saving outputs with optional crops and labels.""" + """ + Perform desired post-processing actions (e.g., pretty-print results, show images, save outputs). + + Args: + pprint (bool): If True, pretty-print the detection results. + show (bool): If True, display the detection results using the default image viewer. + save (bool): If True, save the detection results to the specified directory. + crop (bool): If True, crop detected objects and save them. + render (bool): If True, render annotated results onto the images. + labels (bool): If True, add labels to the bounding boxes in the rendered images. + save_dir (Path): Directory where the processed results will be saved. This is only used if `save` or `crop` + is True. 
+ + Returns: + (str | None): Formatted string of the results if `pprint` is True, otherwise None. + + Example: + ```python + det = Detections(ims, pred, files, times=[0.1, 0.2, 0.3], names=["person", "bike"]) + result_str = det._run(pprint=True, show=True, save=False, crop=False, render=False, labels=True, + save_dir=Path("./outputs")) + print(result_str) # Prints the formatted string of results. + ``` + + Note: + Ensure that the `save_dir` exists when saving the results. The function handles different modes of result + presentation, such as showing images using default viewers or displaying in Jupyter notebooks. + """ s, crops = "", [] for i, (im, pred) in enumerate(zip(self.ims, self.pred)): s += f"\nimage {i + 1}/{len(self.pred)}: {im.shape[0]}x{im.shape[1]} " # string @@ -966,39 +2026,156 @@ class Detections: def show(self, labels=True): """ Displays detection results with optional labels. - - Usage: show(labels=True) + + Args: + labels (bool): If True, include class labels and confidence scores in the displayed results. + + Returns: + None: The function does not return anything. + + Example: + ```python + detections = model(im) # Perform inference + detections.show(labels=True) # Display results with labels + ``` + + Note: + This function leverages the `Annotator` class to draw bounding boxes and labels on images and then displays + them using either Jupyter notebook's display function or the default image viewer in other environments. """ self._run(show=True, labels=labels) # show results def save(self, labels=True, save_dir="runs/detect/exp", exist_ok=False): """ - Saves detection results with optional labels to a specified directory. - - Usage: save(labels=True, save_dir='runs/detect/exp', exist_ok=False) + Save detection results with optional labeling and directory creation. + + Args: + labels (bool): Flag to include labels on the saved images. Defaults to True. + save_dir (str | Path): Directory path where result images and optionally cropped images will be saved. + Defaults to 'runs/detect/exp'. + exist_ok (bool): Flag to allow existing directory content without creating a new directory. + Defaults to False. + + Returns: + None + + Example: + ```python + detections = Detections(ims, pred, files, names=names) + detections.save(labels=True, save_dir='runs/detect/exp', exist_ok=False) + ``` + + Note: + If `exist_ok` is False, the function will create a unique directory by incrementing the name to avoid conflicts + with existing directories. If `save` is True, the images and crops will be saved in the specified `save_dir`. + + This function is particularly useful to persist detection results for future reference, further analysis, or + debugging. """ save_dir = increment_path(save_dir, exist_ok, mkdir=True) # increment save_dir self._run(save=True, labels=labels, save_dir=save_dir) # save results def crop(self, save=True, save_dir="runs/detect/exp", exist_ok=False): """ - Crops detection results, optionally saves them to a directory. - - Args: save (bool), save_dir (str), exist_ok (bool). + Crop detected objects from the input images. + + Args: + save (bool): Whether to save the cropped images to disk. Default is True. + save_dir (str): Directory to save the cropped images. Default is 'runs/detect/exp'. + exist_ok (bool): Whether to overwrite the existing directory if it exists. Default is False. + + Returns: + (list[dict]): List of dictionaries, each containing information about a cropped image, with the keys: + - 'box' (torch.Tensor): Bounding box of the crop with shape (4,). 
+                - 'conf' (torch.Tensor): Confidence score of the detection.
+                - 'cls' (torch.Tensor): Class of the detected object.
+                - 'label' (str): Label string with class name and confidence score.
+                - 'im' (np.ndarray): Cropped image as a numpy array.
+
+        Example:
+            ```python
+            results = model(im)  # perform inference with an AutoShape model
+            crops = results.crop(save=True, save_dir='runs/crops')
+            for crop in crops:
+                print(crop['label'], crop['im'].shape)
+            ```
+
+        Note:
+            - If `save` is True, the cropped images will be saved in the specified `save_dir`, which will be incremented
+              automatically if `exist_ok` is False and the directory already exists.
+            - This function returns both the cropped images and their metadata, which can be useful for further analysis
+              or display.
         """
         save_dir = increment_path(save_dir, exist_ok, mkdir=True) if save else None
         return self._run(crop=True, save=save, save_dir=save_dir)  # crop results

     def render(self, labels=True):
-        """Renders detection results with optional labels on images; args: labels (bool) indicating label inclusion."""
+        """
+        Render detection results by drawing the predicted bounding boxes and labels onto the stored images.
+
+        Args:
+            labels (bool): If True, include class labels and confidence scores on the rendered images. Defaults to True.
+
+        Returns:
+            (list[np.ndarray]): The annotated images as numpy arrays, also stored in `self.ims`.
+
+        Example:
+            ```python
+            results = model(im)  # perform inference with an AutoShape model
+            rendered = results.render(labels=True)  # list of annotated numpy arrays
+            ```
+
+        Note:
+            - Input images should be in RGB format.
+            - The function supports rendering on multiple images if a list of images is provided.
+            - This function is mainly used for visualization purposes in notebooks or GUI applications.
+        """
         self._run(render=True, labels=labels)  # render results
         return self.ims

     def pandas(self):
         """
-        Returns detections as pandas DataFrames for various box formats (xyxy, xyxyn, xywh, xywhn).
-
-        Example: print(results.pandas().xyxy[0]).
+        Convert detections to pandas DataFrames for each box format.
+
+        Returns:
+            (Detections): A copy of this Detections object whose `xyxy`, `xyxyn`, `xywh` and `xywhn` attributes are lists of
+                pandas DataFrames, one per image:
+                - xyxy: DataFrame with columns ["xmin", "ymin", "xmax", "ymax", "confidence", "class", "name"].
+                - xyxyn: DataFrame with columns ["xmin", "ymin", "xmax", "ymax", "confidence", "class", "name"] (normalized).
+                - xywh: DataFrame with columns ["xcenter", "ycenter", "width", "height", "confidence", "class", "name"].
+                - xywhn: DataFrame with columns ["xcenter", "ycenter", "width", "height", "confidence", "class", "name"]
+                  (normalized).
+
+        Example:
+            ```python
+            results = model(im)  # perform inference with an AutoShape model
+            print(results.pandas().xyxy[0])  # DataFrame for the first image in 'xyxy' format
+            ```
         """
         new = copy(self)  # return copy
         ca = "xmin", "ymin", "xmax", "ymax", "confidence", "class", "name"  # xyxy columns
@@ -1010,9 +2187,19 @@ class Detections:
     def tolist(self):
         """
-        Converts a Detections object into a list of individual detection results for iteration.
-
-        Example: for result in results.tolist():
+        Convert detection results to a list of individual per-image detection results.
+
+        Returns:
+            (list[Detections]): A list where each element is a `Detections` object for a single image, maintaining all
+                relevant detection attributes.
+
+        Example:
+            ```python
+            results = model(imgs)  # perform inference
+            for detection in results.tolist():
+                print(detection.pandas().xyxy)
+            ```
         """
         r = range(self.n)  # iterable
         return [
@@ -1028,28 +2215,85 @@ class Detections:
         ]

     def print(self):
-        """Logs the string representation of the current object's state via the LOGGER."""
+        """
+        Logs detection results for each image, including class names and detection counts per class.
+
+        Example:
+            ```python
+            detections = model(images)  # perform inference
+            detections.print()
+            ```
+        """
         LOGGER.info(self.__str__())

     def __len__(self):
-        """Returns the number of results stored, overrides the default len(results)."""
+        """
+        Return the number of per-image results stored, overriding the default `len(results)`.
+
+        Returns:
+            (int): Number of results, one per input image.
+        """
         return self.n

     def __str__(self):
-        """Returns a string representation of the model's results, suitable for printing, overrides default
-        print(results).
+        """
+        Returns a concise string representation of the detection results.
+
+        Returns:
+            (str): A string summarizing the image detection results, including number of detections per class and their
+                respective confidences, along with processing time details.
+
+        Example:
+            ```python
+            detections = Detections(ims, pred, files, times, names, shape)
+            print(str(detections))
+            ```
+
+        Note:
+            Used primarily for logging and quick inspection of detection outputs.
         """
         return self._run(pprint=True)  # print results

     def __repr__(self):
-        """Returns a string representation of the YOLOv5 object, including its class and formatted results."""
+        """
+        Return a string representation of the Detections object including its class and formatted results.
+
+        Returns:
+            (str): A string representation of the Detections object, including class name and formatted results of the
+                detection process.
+
+        Example:
+            ```python
+            detections = Detections(ims, pred, files, times, names, shape)
+            print(repr(detections))
+            ```
+
+        Note:
+            This function is particularly useful for debugging and logging purposes, providing a clear, concise summary of
+            the Detections object.
+        """
         return f"YOLOv5 {self.__class__} instance\n" + self.__str__()


 class Proto(nn.Module):
     # YOLOv5 mask Proto module for segmentation models
     def __init__(self, c1, c_=256, c2=32):
-        """Initializes YOLOv5 Proto module for segmentation with input, proto, and mask channels configuration."""
+        """
+        Initialize the YOLOv5 Proto module for segmentation models.
+
+        Args:
+            c1 (int): Number of input channels.
+            c_ (int): Number of intermediate channels, default is 256.
+            c2 (int): Number of output channels, default is 32.
+ + Example: + ```python + from ultralytics.models.common import Proto + proto = Proto(c1=64, c_=256, c2=32) + ``` + """ super().__init__() self.cv1 = Conv(c1, c_, k=3) self.upsample = nn.Upsample(scale_factor=2, mode="nearest") @@ -1057,7 +2301,25 @@ class Proto(nn.Module): self.cv3 = Conv(c_, c2) def forward(self, x): - """Performs a forward pass using convolutional layers and upsampling on input tensor `x`.""" + """ + Applies convolutional layers and upsampling to generate segmentation masks from input tensor `x`. + + Args: + x (torch.Tensor): Input feature map tensor with shape (N, C1, H, W) where N is the batch size, C1 is the number of + input channels, H is the height, and W is the width. + + Returns: + (torch.Tensor): Output tensor with shape (N, C2, H_out, W_out), where C2 is the number of mask channels, and + H_out and W_out are the height and width after upsampling. + + Example: + ```python + proto_layer = Proto(c1=512, c_=256, c2=32) + input_tensor = torch.rand(1, 512, 64, 64) # Example input tensor + output_tensor = proto_layer(input_tensor) + print(output_tensor.shape) # Expected output shape: (1, 32, 128, 128) + ``` + """ return self.cv3(self.cv2(self.upsample(self.cv1(x)))) @@ -1066,8 +2328,26 @@ class Classify(nn.Module): def __init__( self, c1, c2, k=1, s=1, p=None, g=1, dropout_p=0.0 ): # ch_in, ch_out, kernel, stride, padding, groups, dropout probability - """Initializes YOLOv5 classification head with convolution, pooling, and dropout layers for input to output - channel transformation. + """ + Initializes the Classify module for YOLOv5, transforming input feature maps to classification scores. + + Args: + c1 (int): Number of input channels. + c2 (int): Number of output channels corresponding to the number of classes. + k (int): Convolutional kernel size. Defaults to 1. + s (int): Convolutional stride size. Defaults to 1. + p (int | None): Convolutional padding size. Defaults to None which implies automatic padding. + g (int): Number of groups in convolutional layer. Defaults to 1. + dropout_p (float): Dropout probability. Defaults to 0.0. + + Returns: + (None): This method initializes the Classify instance without returning any value. + + Example: + ```python + classify_head = Classify(c1=2048, c2=1000, k=1, s=1, dropout_p=0.5) + output = classify_head(input_tensor) # where input_tensor is of shape (B, 2048, 20, 20) + ``` """ super().__init__() c_ = 1280 # efficientnet_b0 size @@ -1077,7 +2357,26 @@ class Classify(nn.Module): self.linear = nn.Linear(c_, c2) # to x(b,c2) def forward(self, x): - """Processes input through conv, pool, drop, and linear layers; supports list concatenation input.""" + """ + Forward pass for the YOLOv5 classification head. + + This method takes an input tensor, applies convolution, pooling, and linear layers to produce classification scores. + + Args: + x (torch.Tensor | list[torch.Tensor]): Input tensor or list of tensors with shape (..., C_in, H, W), + where C_in is the number of input channels. + + Returns: + (torch.Tensor): Output tensor with shape (B, C_out), where B is the batch size and C_out is the number of classes. 
+ + Example: + ```python + classify_head = Classify(c1=2048, c2=1000, k=1, s=1, dropout_p=0.5) + input_tensor = torch.rand(8, 2048, 20, 20) # Example input tensor with shape (B, C_in, H, W) + output_tensor = classify_head(input_tensor) + print(output_tensor.shape) # Should output: torch.Size([8, 1000]) + ``` + """ if isinstance(x, list): x = torch.cat(x, 1) return self.linear(self.drop(self.pool(self.conv(x)).flatten(1))) diff --git a/models/experimental.py b/models/experimental.py index ab9b0ed23..45a69fe88 100644 --- a/models/experimental.py +++ b/models/experimental.py @@ -14,8 +14,19 @@ class Sum(nn.Module): """Weighted sum of 2 or more layers https://arxiv.org/abs/1911.09070.""" def __init__(self, n, weight=False): - """Initializes a module to sum outputs of layers with number of inputs `n` and optional weighting, supporting 2+ - inputs. + """ + Initialize the Sum module to aggregate outputs from multiple layers, optionally with weights. + + Args: + n (int): Number of layers to sum. Must be 2 or more. + weight (bool): If True, applies weights to the inputs before summing. + + Returns: + None + + Notes: + Refer to "Weighted sum of 2 or more layers" at https://arxiv.org/abs/1911.09070 for detailed insights + and usage scenarios. """ super().__init__() self.weight = weight # apply weights boolean @@ -24,7 +35,26 @@ class Sum(nn.Module): self.w = nn.Parameter(-torch.arange(1.0, n) / 2, requires_grad=True) # layer weights def forward(self, x): - """Processes input through a customizable weighted sum of `n` inputs, optionally applying learned weights.""" + """ + Compute a weighted or unweighted sum of input tensors. + + Args: + x (list[torch.Tensor]): List of input tensors to be summed, with each tensor having the same shape (N, D). + + Returns: + (torch.Tensor): The resulting tensor after summing the input tensors, maintaining the same shape (N, D). + + Example: + ```python + sum_layer = Sum(n=3, weight=False) + inputs = [torch.rand(1, 10), torch.rand(1, 10), torch.rand(1, 10)] + result = sum_layer.forward(inputs) + ``` + + Note: + If `weight` is set to True when initializing the class, weights will be applied to the inputs before summing. + For more information, refer to "Weighted sum of 2 or more layers" at https://arxiv.org/abs/1911.09070. + """ y = x[0] # no weight if self.weight: w = torch.sigmoid(self.w) * 2 @@ -40,8 +70,29 @@ class MixConv2d(nn.Module): """Mixed Depth-wise Conv https://arxiv.org/abs/1907.09595.""" def __init__(self, c1, c2, k=(1, 3), s=1, equal_ch=True): - """Initializes MixConv2d with mixed depth-wise convolutional layers, taking input and output channels (c1, c2), - kernel sizes (k), stride (s), and channel distribution strategy (equal_ch). + """ + Initialize the MixConv2d module, handling mixed depth-wise convolutional operations. + + Args: + c1 (int): Number of input channels (C1). + c2 (int): Number of output channels (C2). + k (tuple[int]): Kernel sizes for the convolutional layers. + s (int): Stride value for the convolutional layers. + equal_ch (bool): Flag to determine if channels are distributed equally. True for equal channels per group, False + for equal weight.numel() per group. + + Example: + ```python + mixconv = MixConv2d(c1=32, c2=64, k=(1, 3, 5), s=1, equal_ch=True) + output = mixconv(input_tensor) + ``` + + Note: + The `MixConv2d` layer applies multiple depth-wise convolutions with different kernel sizes in parallel, which + can capture multi-scale features within a single layer. 
This technique is particularly useful for improving + spatial feature extraction and reducing model complexity. + + Further reading: https://arxiv.org/abs/1907.09595 """ super().__init__() n = len(k) # number of convolutions @@ -63,8 +114,24 @@ class MixConv2d(nn.Module): self.act = nn.SiLU() def forward(self, x): - """Performs forward pass by applying SiLU activation on batch-normalized concatenated convolutional layer - outputs. + """ + Perform forward pass by applying mixed depth-wise convolutions followed by batch normalization and SiLU activation. + + Args: + x (torch.Tensor): Input tensor with shape (N, C, H, W) where N is the batch size, C is the number of channels, + H is the height, and W is the width. + + Returns: + (torch.Tensor): Output tensor after applying mixed convolutions, batch normalization, and SiLU activation, + maintaining the shape (N, C', H', W') where C' is the output channels based on the convolutional layer + configuration. + + Example: + ```python + mixconv = MixConv2d(c1=32, c2=64, k=(1, 3), s=1) + x = torch.randn(16, 32, 128, 128) + output = mixconv(x) + ``` """ return self.act(self.bn(torch.cat([m(x) for m in self.m], 1))) @@ -73,11 +140,51 @@ class Ensemble(nn.ModuleList): """Ensemble of models.""" def __init__(self): - """Initializes an ensemble of models to be used for aggregated predictions.""" + """ + Initializes an ensemble of models for combined inference and aggregated predictions. + + Example: + ```python + ensemble = Ensemble() + model1 = MyModel1() + model2 = MyModel2() + ensemble.append(model1) + ensemble.append(model2) + ``` + """ super().__init__() def forward(self, x, augment=False, profile=False, visualize=False): - """Performs forward pass aggregating outputs from an ensemble of models..""" + """ + Aggregates outputs from multiple models in the ensemble by concatenating them during the forward pass. + + Args: + x (torch.Tensor): Input tensor with shape (N, C, H, W) where N is the batch size, C is the number of channels, + H is the height, and W is the width. + augment (bool): Flag to apply test-time augmentation (TTA) during inference. Default is False. + profile (bool): If True, enables profiling of the forward pass. Default is False. + visualize (bool): If True, enables visualization of model predictions. Default is False. + + Returns: + (torch.Tensor): Aggregated output tensor from the ensemble models, with shape dependent on the number of models + and their architectures. + + Example: + ```python + from ultralytics import Ensemble + import torch + + # Initialize the ensemble + ensemble = Ensemble() + # Assume models are already added to the ensemble + + # Create a dummy input tensor + x = torch.randn(8, 3, 640, 640) # Example input for 8 images of 3 channels and 640x640 resolution + + # Perform forward pass + output = ensemble.forward(x, augment=False, profile=False, visualize=False) + ``` + """ y = [module(x, augment, profile, visualize)[0] for module in self] # y = torch.stack(y).max(0)[0] # max ensemble # y = torch.stack(y).mean(0) # mean ensemble @@ -87,9 +194,32 @@ class Ensemble(nn.ModuleList): def attempt_load(weights, device=None, inplace=True, fuse=True): """ - Loads and fuses an ensemble or single YOLOv5 model from weights, handling device placement and model adjustments. - - Example inputs: weights=[a,b,c] or a single model weights=[a] or weights=a. + Loads and fuses a YOLOv5 model or an ensemble of models from provided weights, adjusting device placement and model + attributes for optimal performance. 
+ + Args: + weights (str | list[str]): Path(s) to model weight file(s). It can be a single path or a list of paths. + device (torch.device | None, optional): Device to load the model on. If None, loads on CPU by default. + inplace (bool, optional): If True, enables inplace operations in certain layers like activation layers. + Defaults to True. + fuse (bool, optional): Whether to fuse Conv2d + BatchNorm2d layers for speedup during inference. Defaults to True. + + Returns: + (torch.nn.Module): Loaded YOLOv5 model or an ensemble of models loaded onto the specified device. + + Example: + ```python + # Load a single model weight + model = attempt_load('yolov5s.pt') + + # Load an ensemble of models + model = attempt_load(['yolov5s.pt', 'yolov5m.pt', 'yolov5l.pt']) + ``` + + Note: + - This function ensures compatibility and performance optimization by adjusting attributes and configurations of the + loaded model(s). + - If `fuse` is set to True, it will fuse Conv2d and BatchNorm2d layers within the model(s) to speed up inference. """ from models.yolo import Detect, Model diff --git a/models/tf.py b/models/tf.py index 9884ec3db..b6b0f2f76 100644 --- a/models/tf.py +++ b/models/tf.py @@ -51,7 +51,28 @@ from utils.general import LOGGER, make_divisible, print_args class TFBN(keras.layers.Layer): # TensorFlow BatchNormalization wrapper def __init__(self, w=None): - """Initializes a TensorFlow BatchNormalization layer with optional pretrained weights.""" + """ + Initializes a TensorFlow BatchNormalization layer, optionally using pretrained weights for initialization. + + Args: + w (torch.nn.Module | None): PyTorch BatchNormalization layer whose weights are used to initialize the TensorFlow + BatchNormalization layer. If None, the BatchNormalization layer is initialized with default parameters. + + Returns: + (None): This constructor does not return any value. + + Example: + ```python + import torch.nn as nn + from tensorflow.keras import layers + + # Create a PyTorch batch normalization layer + torch_bn = nn.BatchNorm2d(num_features=64) + + # Initialize a TFBN layer with PyTorch BN weights + tf_bn = TFBN(w=torch_bn) + ``` + """ super().__init__() self.bn = keras.layers.BatchNormalization( beta_initializer=keras.initializers.Constant(w.bias.numpy()), @@ -62,7 +83,27 @@ class TFBN(keras.layers.Layer): ) def call(self, inputs): - """Applies batch normalization to the inputs.""" + """ + Apply batch normalization to the given inputs using pretrained weights. + + Args: + inputs (tf.Tensor): Input tensor to normalize, with shape (batch_size, ..., channels). + + Returns: + (tf.Tensor): Batch-normalized tensor with same shape as the input. + + Example: + ```python + # Assume `inputs` is a TensorFlow tensor with shape (N, H, W, C) + bn_layer = TFBN(w=pretrained_weights) + normalized_output = bn_layer.call(inputs) + ``` + + Note: + The `w` parameter used during initialization must be a PyTorch BatchNorm layer containing + pretrained weights. Ensure the `w` object has `bias`, `weight`, `running_mean`, `running_var`, + and `eps` attributes used for initializing the TFBN layer. + """ return self.bn(inputs) @@ -70,10 +111,28 @@ class TFPad(keras.layers.Layer): # Pad inputs in spatial dimensions 1 and 2 def __init__(self, pad): """ - Initializes a padding layer for spatial dimensions 1 and 2 with specified padding, supporting both int and tuple - inputs. - - Inputs are + Initialize a padding layer for spatial dimensions 1 and 2. + + Args: + pad (int | tuple[int, int]): Padding size for the spatial dimensions. 
If an integer is provided, the same + padding is applied symmetrically to the spatial dimensions. If a tuple is provided, it should contain two + integers representing padding for height and width respectively. + + Returns: + None + + Example: + ```python + # Using integer padding + padding_layer = TFPad(1) + + # Using tuple padding + padding_layer = TFPad((1, 2)) + ``` + + Note: + The padding is added to the input tensor in TensorFlow format, i.e., [[0, 0], [pad_height, pad_height], + [pad_width, pad_width], [0, 0]]. """ super().__init__() if isinstance(pad, int): @@ -82,7 +141,36 @@ class TFPad(keras.layers.Layer): self.pad = tf.constant([[0, 0], [pad[0], pad[0]], [pad[1], pad[1]], [0, 0]]) def call(self, inputs): - """Pads input tensor with zeros using specified padding, suitable for int and tuple pad dimensions.""" + """ + Pad an input tensor with zeros in specified spatial dimensions. + + Args: + inputs (tf.Tensor): Input tensor to be padded, with shape (N, H, W, C). + + Returns: + (tf.Tensor): Padded tensor with shape (N, H + 2 * pad_height, W + 2 * pad_width, C). + + Example: + ```python + import tensorflow as tf + from your_module import TFPad + + # Create a sample input tensor with shape (1, 5, 5, 1) + input_tensor = tf.random.normal((1, 5, 5, 1)) + + # Using integer padding + padding_layer = TFPad(1) + output_tensor = padding_layer.call(input_tensor) + + # Using tuple padding + padding_layer = TFPad((1, 2)) + output_tensor = padding_layer.call(input_tensor) + ``` + + Note: + The padding is added to the input tensor in TensorFlow format, i.e., [[0, 0], [pad_height, pad_height], + [pad_width, pad_width], [0, 0]]. + """ return tf.pad(inputs, self.pad, mode="constant", constant_values=0) @@ -90,10 +178,27 @@ class TFConv(keras.layers.Layer): # Standard convolution def __init__(self, c1, c2, k=1, s=1, p=None, g=1, act=True, w=None): """ - Initializes a standard convolution layer with optional batch normalization and activation; supports only - group=1. - - Inputs are ch_in, ch_out, weights, kernel, stride, padding, groups. + Performs a standard 2D convolution with optional batch normalization and activation in TensorFlow. + + Args: + c1 (int): Number of input channels. + c2 (int): Number of output channels. + k (int, optional): Kernel size of the convolution. Default is 1. + s (int, optional): Stride of the convolution. Default is 1. + p (int | None, optional): Padding size. If None, padding is automatically determined. Default is None. + g (int, optional): Number of groups for grouped convolution. Default is 1. Note: must be 1 for TF. + act (bool, optional): Boolean to include activation. Default is True. + w (torch.nn.Module | None, optional): Pretrained weights from a PyTorch model to initialize the layer. Default is None. + + Returns: + None: This function initializes an instance of the TFConv class. + + Example: + ```python + tf_conv = TFConv(c1=32, c2=64, k=3, s=1, w=pretrained_weights) + ``` + Note: + TF v2.2 Conv2D does not support the 'groups' argument (must be 1). """ super().__init__() assert g == 1, "TF v2.2 Conv2D does not support 'groups' argument" @@ -113,7 +218,28 @@ class TFConv(keras.layers.Layer): self.act = activations(w.act) if act else tf.identity def call(self, inputs): - """Applies convolution, batch normalization, and activation function to input tensors.""" + """ + Apply convolution, batch normalization, and activation to input tensors. 
+ + Args: + inputs (tf.Tensor): Input tensor with shape (N, H, W, C) where N is the batch size, H is the height, + W is the width, and C is the number of channels. + + Returns: + (tf.Tensor): Output tensor after applying convolution, batch normalization, and activation, + maintaining shape (N, H, W, C). + + Example: + ```python + input_tensor = tf.random.normal((1, 224, 224, 3)) + conv_layer = TFConv(c1=3, c2=16, k=3, s=1) + output_tensor = conv_layer(input_tensor) + ``` + + Note: + This method calls the `call` method of the internal sequential layers consisting of padding (if stride + isn't 1), convolution, batch normalization (if enabled), and activation function (if enabled). + """ return self.act(self.bn(self.conv(inputs))) @@ -121,10 +247,28 @@ class TFDWConv(keras.layers.Layer): # Depthwise convolution def __init__(self, c1, c2, k=1, s=1, p=None, act=True, w=None): """ - Initializes a depthwise convolution layer with optional batch normalization and activation for TensorFlow - models. - - Input are ch_in, ch_out, weights, kernel, stride, padding, groups. + Initialize a depthwise convolution layer with optional batch normalization and activation for TensorFlow models. + + Args: + c1 (int): Number of input channels. + c2 (int): Number of output channels. Must be a multiple of `c1`. + k (int, optional): Size of the convolution kernel. Default is 1. + s (int, optional): Stride of the convolution. Default is 1. + p (int | tuple[int, int] | None, optional): Padding size; supports both integer and tuple inputs. Default is None. + act (bool, optional): Whether to apply an activation function. Default is True. + w (object | None, optional): Pretrained weights. Default is None. + + Returns: + (None): This constructor does not return any values. + + Example: + ```python + import keras + from models.tf import TFDWConv + + # Initialize the layer + conv_layer = TFDWConv(c1=32, c2=64, k=3, s=1, p=1, act=True, w=pretrained_weights) + ``` """ super().__init__() assert c2 % c1 == 0, f"TFDWConv() output={c2} must be a multiple of input={c1} channels" @@ -142,7 +286,35 @@ class TFDWConv(keras.layers.Layer): self.act = activations(w.act) if act else tf.identity def call(self, inputs): - """Applies convolution, batch normalization, and activation function to input tensors.""" + """ + Applies depthwise convolution, batch normalization, and an activation function to the input tensors. + + Args: + inputs (tf.Tensor): Input tensor with shape (N, H, W, C), representing a batch of images. + + Returns: + (tf.Tensor): Resulting tensor after the depthwise convolution, batch normalization, and activation are applied, + with shape (N, H', W', C') depending on the convolution parameters. + + Example: + ```python + import tensorflow as tf + from models.tf import TFDWConv + + # Dummy input tensor with shape (batch_size, height, width, channels) + inputs = tf.random.normal([8, 32, 32, 32]) + + # Initialize depthwise convolution layer + conv_layer = TFDWConv(c1=32, c2=64, k=3, s=1, p=1, act=True) + + # Apply depthwise convolution + outputs = conv_layer(inputs) + ``` + + Note: + Padding is added to the input tensor in TensorFlow format, i.e., [[0, 0], [pad_height, pad_height], [pad_width, + pad_width], [0, 0]]. + """ return self.act(self.bn(self.conv(inputs))) @@ -150,9 +322,37 @@ class TFDWConvTranspose2d(keras.layers.Layer): # Depthwise ConvTranspose2d def __init__(self, c1, c2, k=1, s=1, p1=0, p2=0, w=None): """ - Initializes depthwise ConvTranspose2D layer with specific channel, kernel, stride, and padding settings. 
- - Inputs are ch_in, ch_out, weights, kernel, stride, padding, groups. + Initialize a Depthwise ConvTranspose2D layer with specific channel, kernel, stride, and padding configurations. + + Args: + c1 (int): Number of input channels; must equal `c2`. + c2 (int): Number of output channels; must equal `c1`. + k (int): Kernel size; currently supports only `k=4`. + s (int): Stride size for the transposed convolution. + p1 (int): Padding applied to the original input; currently supports only `p1=1`. + p2 (int): Additional padding applied to the transposed output. + w (torch.nn.Module): Pre-trained weights, including both kernel and bias, for initialization. + + Returns: + (None): This constructor does not return any values. + + Example: + ```python + import tensorflow as tf + from models.tf import TFDWConvTranspose2d + + # Define input tensor + input_tensor = tf.random.normal([1, 64, 64, 32]) + + # Initialize the TFDWConvTranspose2d layer + depthwise_conv_transpose2d = TFDWConvTranspose2d(c1=32, c2=32, k=4, s=2, p1=1, p2=0, w=pretrained_weights) + + # Apply the layer + output_tensor = depthwise_conv_transpose2d(input_tensor) + ``` + + Note: + This layer is designed for depthwise convolution with specific constraints on kernel size and initial padding. """ super().__init__() assert c1 == c2, f"TFDWConv() output={c2} must be equal to input={c1} channels" @@ -174,7 +374,33 @@ class TFDWConvTranspose2d(keras.layers.Layer): ] def call(self, inputs): - """Processes input through parallel convolutions and concatenates results, trimming border pixels.""" + """ + Perform upsampling using depthwise transposed convolution, followed by concatenation across channels. + + Args: + inputs (tf.Tensor): Input tensor with shape (N, H, W, C1), where N is batch size, H is height, W is width, + and C1 is the number of input channels. + + Returns: + (tf.Tensor): Output tensor after applying depthwise ConvTranspose2D and concatenation, with shape + (N, (H-1)*stride + kernel_size, (W-1)*stride + kernel_size, C1). After upsampling, 1 pixel is cropped + from the border of the output to match expected dimensions. + + Example: + ```python + # Define input tensor + input_tensor = tf.random.normal([1, 64, 64, 32]) + + # Initialize the TFDWConvTranspose2d layer + depthwise_conv_transpose2d_layer = TFDWConvTranspose2d(c1=32, c2=32, k=4, s=2, p1=1, p2=0, w=w) + + # Apply the layer + output_tensor = depthwise_conv_transpose2d_layer(input_tensor) + ``` + + Note: + This function handles specific kernel size (k=4) and padding constraints (p1=1) for depthwise ConvTranspose2D. + """ return tf.concat([m(x) for m, x in zip(self.conv, tf.split(inputs, self.c1, 3))], 3)[:, 1:-1, 1:-1] @@ -182,19 +408,53 @@ class TFFocus(keras.layers.Layer): # Focus wh information into c-space def __init__(self, c1, c2, k=1, s=1, p=None, g=1, act=True, w=None): """ - Initializes TFFocus layer to focus width and height information into channel space with custom convolution - parameters. - - Inputs are ch_in, ch_out, kernel, stride, padding, groups. + Initializes TFFocus layer to focus width and height information into channel space with custom convolution parameters. + + Args: + c1 (int): Number of input channels. + c2 (int): Number of output channels. + k (int, optional): Size of the convolutional kernel. Default is 1. + s (int, optional): Stride value for the convolutional layer. Default is 1. + p (int | None, optional): Padding value. If None, will be automatically determined based on `k`. Default is None. 
+ g (int, optional): Number of groups for the convolution. Default is 1. + act (bool, optional): Whether to use an activation layer. Default is True. + w (torch.nn.Module | None, optional): Pre-trained weight object containing convolution, batch norm, and activation + layers. Default is None. + + Returns: + None + + Example: + ```python + focus_layer = TFFocus(c1=3, c2=64, k=3, s=1, p=1, act=True) + output = focus_layer(inputs) + ``` + + Note: + Ensure that the input tensor dimensions match the expected values for width and height focusing to operate correctly. """ super().__init__() self.conv = TFConv(c1 * 4, c2, k, s, p, g, act, w.conv) def call(self, inputs): """ - Performs pixel shuffling and convolution on input tensor, downsampling by 2 and expanding channels by 4. - - Example x(b,w,h,c) -> y(b,w/2,h/2,4c). + Perform pixel shuffling and convolution on the input tensor, converting spatial dimensions into channel space. + + Args: + inputs (tf.Tensor): Input tensor with shape (B, H, W, C), where B is the batch size, H is the height, + W is the width, and C is the number of channels. + + Returns: + (tf.Tensor): Output tensor after pixel shuffling and convolution, with shape (B, H/2, W/2, 4C). + + Example: + ```python + focus_layer = TFFocus(c1=32, c2=64, k=1, s=1) + output = focus_layer(inputs) # inputs should be a tensor with shape (B, H, W, C) + ``` + + Note: + Ensure input tensor dimensions match expected values for correct width and height focusing. """ inputs = [inputs[:, ::2, ::2, :], inputs[:, 1::2, ::2, :], inputs[:, ::2, 1::2, :], inputs[:, 1::2, 1::2, :]] return self.conv(tf.concat(inputs, 3)) @@ -204,10 +464,38 @@ class TFBottleneck(keras.layers.Layer): # Standard bottleneck def __init__(self, c1, c2, shortcut=True, g=1, e=0.5, w=None): """ - Initializes a standard bottleneck layer for TensorFlow models, expanding and contracting channels with optional - shortcut. - - Arguments are ch_in, ch_out, shortcut, groups, expansion. + Initialize a standard bottleneck layer for TensorFlow models, typically used for residual connections in a network. + + Args: + c1 (int): Number of input channels. + c2 (int): Number of output channels. + shortcut (bool, optional): Whether to include a shortcut connection. Default is True. + g (int, optional): Number of groups for group convolution. Default is 1. + e (float, optional): Expansion factor for hidden channels. Default is 0.5. + w (object, optional): Pretrained weights from a PyTorch model to initialize the layer. Default is None. + + Returns: + None + + Example: + ```python + import tensorflow as tf + + # Initialize the TFBottleneck layer + c1, c2 = 64, 128 + bottleneck_layer = TFBottleneck(c1, c2) + + # Define input tensor + inputs = tf.random.normal([1, 32, 32, c1]) + + # Apply the bottleneck layer + outputs = bottleneck_layer(inputs) + print(outputs.shape) # Expected output shape: (1, 32, 32, c2) + ``` + + Note: + The bottleneck layer can be customized using pretrained weights for improved performance. Ensure the input tensor + dimensions match the expected values when applying the bottleneck transformation. """ super().__init__() c_ = int(c2 * e) # hidden channels @@ -216,8 +504,27 @@ class TFBottleneck(keras.layers.Layer): self.add = shortcut and c1 == c2 def call(self, inputs): - """Performs forward pass; if shortcut is True & input/output channels match, adds input to the convolution - result. + """ + Perform forward pass of the TFBottleneck module. 
+ + Args: + inputs (tf.Tensor): Input tensor with shape (N, H, W, C), where N is the batch size, H is the height, + W is the width, and C is the number of channels. + + Returns: + (tf.Tensor): Output tensor with shape (N, H, W, C2) where C2 is the number of output channels after the + bottleneck operation. + + Example: + ```python + bottleneck = TFBottleneck(64, 128, shortcut=True) + x = tf.random.uniform((1, 128, 128, 64)) + y = bottleneck(x) + ``` + + Note: + If `self.add` is True, the function will add the input tensor to the convolution output. This typically occurs + when the input and output channels are the same, and the `shortcut` parameter is set to True. """ return inputs + self.cv2(self.cv1(inputs)) if self.add else self.cv2(self.cv1(inputs)) @@ -225,7 +532,33 @@ class TFBottleneck(keras.layers.Layer): class TFCrossConv(keras.layers.Layer): # Cross Convolution def __init__(self, c1, c2, k=3, s=1, g=1, e=1.0, shortcut=False, w=None): - """Initializes cross convolution layer with optional expansion, grouping, and shortcut addition capabilities.""" + """ + Perform an enhanced cross convolution operation with optional expansion, grouping, and shortcut addition capabilities. + + Args: + c1 (int): Number of input channels. + c2 (int): Number of output channels. + k (int): Kernel size for the convolution operations. Default is 3. + s (int): Stride size for the convolution operations. Default is 1. + g (int): Number of groups for the grouped convolution. Default is 1. + e (float): Expansion coefficient for intermediate channels. Default is 1.0. + shortcut (bool): Whether to apply a shortcut connection (residual connection). Default is False. + w (object | None): Pretrained weights object containing convolution parameters. Default is None. + + Returns: + None: This constructor initializes an instance of the TFCrossConv class. + + Example: + ```python + cross_conv_layer = TFCrossConv(c1=32, c2=64, k=5, s=2, w=pretrained_weights) + output = cross_conv_layer(inputs) # Input tensor should have a shape compatible with these parameters + ``` + + Note: + The cross convolution operation applies a two-step convolutions with different kernel shapes `(1, k)` and `(k, 1)`, + preceded by an optional expansion through an intermediate layer. When `shortcut` is True, the input is directly + added to the output of the two-step convolutions. + """ super().__init__() c_ = int(c2 * e) # hidden channels self.cv1 = TFConv(c1, c_, (1, k), (1, s), w=w.cv1) @@ -233,15 +566,75 @@ class TFCrossConv(keras.layers.Layer): self.add = shortcut and c1 == c2 def call(self, inputs): - """Passes input through two convolutions optionally adding the input if channel dimensions match.""" + """ + Perform cross convolution operations on input tensors. + + Args: + inputs (tf.Tensor): Input tensor with shape (N, H, W, C), where N is the batch size, H is the height, + W is the width, and C is the number of channels. + + Returns: + (tf.Tensor): Tensor after applying cross convolution operations, with shape (N, H, W, C2) where C2 is the + number of output channels. 
+
+        Example:
+            ```python
+            import tensorflow as tf
+
+            # Define input tensor
+            input_tensor = tf.random.normal([1, 64, 64, 32])
+
+            # Initialize the TFCrossConv layer
+            cross_conv_layer = TFCrossConv(c1=32, c2=64, k=3, s=1, g=1, e=1.0, shortcut=True)
+
+            # Apply the layer
+            output_tensor = cross_conv_layer(input_tensor)
+            print(output_tensor.shape)  # Expected output shape: (1, 64, 64, 64)
+            ```
+
+        Note:
+            If `shortcut` is True and the number of input channels (`c1`) equals the number of output channels (`c2`),
+            a shortcut connection is added between the input and the output.
+        """
         return inputs + self.cv2(self.cv1(inputs)) if self.add else self.cv2(self.cv1(inputs))


 class TFConv2d(keras.layers.Layer):
     # Substitution for PyTorch nn.Conv2D
     def __init__(self, c1, c2, k, s=1, g=1, bias=True, w=None):
-        """Initializes a TensorFlow 2D convolution layer, mimicking PyTorch's nn.Conv2D functionality for given filter
-        sizes and stride.
+        """
+        Initialize a TensorFlow Conv2D layer, mimicking the behavior of PyTorch's Conv2D, with optional pretrained weights.
+
+        Args:
+            c1 (int): Number of input channels.
+            c2 (int): Number of output channels.
+            k (int | tuple[int, int]): Size of the convolutional kernel.
+            s (int, optional): Stride size. Defaults to 1.
+            g (int, optional): Number of groups. Only supported value is 1. Defaults to 1.
+            bias (bool, optional): Whether to include a bias term. Defaults to True.
+            w (torch.nn.Module | None, optional): Pretrained weights taken from a PyTorch model. Defaults to None.
+
+        Returns:
+            None: This constructor initializes the Conv2D layer within the class.
+
+        Example:
+            ```python
+            import torch
+            from models.tf import TFConv2d
+
+            # Define parameters
+            c1, c2, k, s = 3, 64, 3, 1
+
+            # Pretrained weights from a PyTorch model
+            pretrained_weights = torch.nn.Conv2d(c1, c2, k)
+
+            # Initialize TFConv2d layer
+            conv_layer = TFConv2d(c1=c1, c2=c2, k=k, s=s, bias=True, w=pretrained_weights)
+            ```
+
+        Note:
+            `keras.layers.Conv2D` in TF v2.2 does not support the 'groups' argument, which limits `g` to 1.
+
         """
         super().__init__()
         assert g == 1, "TF v2.2 Conv2D does not support 'groups' argument"
@@ -256,7 +649,38 @@ class TFConv2d(keras.layers.Layer):
         )

     def call(self, inputs):
-        """Applies a convolution operation to the inputs and returns the result."""
+        """
+        Apply a convolution operation to the input tensor.
+
+        Args:
+            inputs (tf.Tensor): Input tensor with shape (B, H, W, C), where B is the batch size, H is the height,
+                W is the width, and C is the number of channels.
+
+        Returns:
+            (tf.Tensor): Tensor resulting from the convolution operation, with shape (B, H_out, W_out, C_out) where H_out and
+                W_out are the output height and width, and C_out is the number of output channels.
+
+        Example:
+            ```python
+            import tensorflow as tf
+            from models.tf import TFConv2d
+
+            # Example input tensor
+            input_tensor = tf.random.normal([1, 32, 32, 3])
+
+            # Initialize Conv2D layer
+            conv2d_layer = TFConv2d(c1=3, c2=16, k=3, s=1, bias=True, w=pretrained_weights)
+
+            # Apply Conv2D layer
+            output_tensor = conv2d_layer(input_tensor)
+            ```
+
+        Note:
+            This layer uses TensorFlow's Conv2D operation to simulate PyTorch's nn.Conv2D behavior. It supports only
+            single-group convolutions (`g=1`) and padding is added manually when necessary.
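+
+            As a further illustration (an assumed usage pattern, not part of this module), an input coming from a
+            PyTorch-style NCHW pipeline can be transposed to the NHWC layout that these TensorFlow layers expect:
+            ```python
+            import numpy as np
+            import tensorflow as tf
+
+            nchw = np.random.rand(1, 3, 32, 32).astype(np.float32)  # PyTorch-style (N, C, H, W) array
+            nhwc = tf.convert_to_tensor(nchw.transpose(0, 2, 3, 1))  # TensorFlow-style (N, H, W, C) tensor
+            ```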
+ """ return self.conv(inputs) @@ -264,10 +688,29 @@ class TFBottleneckCSP(keras.layers.Layer): # CSP Bottleneck https://github.com/WongKinYiu/CrossStagePartialNetworks def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5, w=None): """ - Initializes CSP bottleneck layer with specified channel sizes, count, shortcut option, groups, and expansion - ratio. - - Inputs are ch_in, ch_out, number, shortcut, groups, expansion. + Initializes CSP bottleneck layer with specified input/output channels, layer count, and network topology options. + + Args: + c1 (int): Number of input channels. + c2 (int): Number of output channels. + n (int): Number of bottleneck layers. Default is 1. + shortcut (bool): Whether to use shortcut connections or not. + g (int): Number of groups for group convolution. Default is 1. + e (float): Expansion ratio for hidden layers. Default is 0.5. + w (object): Weights container to initialize the layers. + + Returns: + (keras.layers.Layer): Constructed TensorFlow layer with CSP bottleneck configuration. + + Example: + ```python + csp_bottleneck = TFBottleneckCSP(c1=64, c2=128, n=2, shortcut=True, g=1, e=0.5, w=weights) + output = csp_bottleneck(inputs) + ``` + + Note: + Uses `TFConv` and `TFConv2d` for convolution operations, `TFBN` for batch normalization, and Keras Swish + activation by default. This is based on the Cross Stage Partial Networks (CSPNet) architecture. """ super().__init__() c_ = int(c2 * e) # hidden channels @@ -280,8 +723,36 @@ class TFBottleneckCSP(keras.layers.Layer): self.m = keras.Sequential([TFBottleneck(c_, c_, shortcut, g, e=1.0, w=w.m[j]) for j in range(n)]) def call(self, inputs): - """Processes input through the model layers, concatenates, normalizes, activates, and reduces the output - dimensions. + """ + Applies a CSP (Cross Stage Partial Networks) Bottleneck convolutional block to the input tensor. + + Args: + inputs (tf.Tensor): Input tensor with shape (N, H, W, C), where N represents the batch size, H is the height, + W is the width, and C is the number of channels. + + Returns: + (tf.Tensor): Output tensor after applying the CSP bottleneck block, which maintains the same batch size and + spatial dimensions but with modified channel dimensions depending on the convolutions' configurations. + + Example: + ```python + import tensorflow as tf + from yolov5_models import TFBottleneckCSP + + # Define input tensor with shape (batch_size, height, width, channels) + inputs = tf.random.normal([1, 128, 128, 64]) + + # Initialize TFBottleneckCSP layer + bottleneck_csp_layer = TFBottleneckCSP(c1=64, c2=128, n=1, shortcut=True, g=1, e=0.5) + + # Apply the layer to input tensor + outputs = bottleneck_csp_layer(inputs) + print(outputs.shape) # Expected output shape: (1, 128, 128, 128) + ``` + + Note: + The CSP architecture helps in strengthening gradient flow across the network, improving training dynamics, and + ensuring efficient parameter utilization. """ y1 = self.cv3(self.m(self.cv1(inputs))) y2 = self.cv2(inputs) @@ -292,9 +763,31 @@ class TFC3(keras.layers.Layer): # CSP Bottleneck with 3 convolutions def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5, w=None): """ - Initializes CSP Bottleneck with 3 convolutions, supporting optional shortcuts and group convolutions. - - Inputs are ch_in, ch_out, number, shortcut, groups, expansion. + Perform CSP bottleneck operations with 3 convolutions, supporting optional shortcuts and group convolutions. + + Args: + c1 (int): Number of input channels. + c2 (int): Number of output channels. 
+            n (int): Number of bottleneck layers to apply.
+            shortcut (bool): Determines whether to use shortcuts. Default is True.
+            g (int): Number of groups for convolutions. Default is 1.
+            e (float): Expansion ratio for bottleneck channels. Default is 0.5.
+            w (torch.nn.Module | None): Pretrained weights from a PyTorch model to initialize the TensorFlow layer. Default is None.
+
+        Returns:
+            None: This method initializes the TFC3 layer.
+
+        Example:
+            ```python
+            # Example usage of TFC3
+            tfc3_layer = TFC3(c1=64, c2=128, n=3, shortcut=True, g=1, e=0.5)
+            input_tensor = tf.random.normal([1, 64, 64, 64])
+            output_tensor = tfc3_layer(input_tensor)
+            ```
+
+        Note:
+            This layer implements the CSPNet architecture with 3 convolutions, integrated in TensorFlow. It suits models
+            that require efficient channel-wise transformations while maintaining the original network dimensions.
         """
         super().__init__()
         c_ = int(c2 * e)  # hidden channels
@@ -305,9 +798,23 @@
     def call(self, inputs):
         """
-        Processes input through a sequence of transformations for object detection (YOLOv5).
-
-        See https://github.com/ultralytics/yolov5.
+        Perform a forward pass through the CSP Bottleneck layer with 3 convolutions.
+
+        Args:
+            inputs (tf.Tensor): Input tensor to the layer, with shape (batch_size, height, width, channels).
+
+        Returns:
+            (tf.Tensor): Output tensor produced after applying the CSP Bottleneck transformations.
+
+        Example:
+            ```python
+            csp_bottleneck = TFC3(c1=64, c2=128, n=2, shortcut=True, g=1, e=0.5)
+            output_tensor = csp_bottleneck(input_tensor)
+            ```
+
+        Note:
+            This layer is part of the Ultralytics YOLOv5 model configuration for TensorFlow.
+            See https://github.com/ultralytics/yolov5 for more details.
         """
         return self.cv3(tf.concat((self.m(self.cv1(inputs)), self.cv2(inputs)), axis=3))

@@ -316,9 +823,23 @@ class TFC3x(keras.layers.Layer):
     # 3 module with cross-convolutions
     def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5, w=None):
         """
-        Initializes layer with cross-convolutions for enhanced feature extraction in object detection models.
-
-        Inputs are ch_in, ch_out, number, shortcut, groups, expansion.
+        Initialize a C3 module variant that uses cross-convolutions for enhanced feature extraction in object detection models.
+
+        Args:
+            c1 (int): Number of input channels.
+            c2 (int): Number of output channels.
+            n (int, optional): Number of CrossConv layers. Defaults to 1.
+            shortcut (bool, optional): Whether to use shortcut connection. Defaults to True.
+            g (int, optional): Number of groups for grouped convolution. Defaults to 1.
+            e (float, optional): Expansion ratio. Defaults to 0.5.
+            w (object, optional): Pretrained weights from a PyTorch model.
+
+        Returns:
+            None
+
+        Note:
+            This class is part of the TensorFlow, Keras, and TFLite versions of YOLOv5 as authored in
+            https://github.com/ultralytics/yolov5/pull/1127. For usage, see https://github.com/ultralytics/yolov5.
         """
         super().__init__()
         c_ = int(c2 * e)  # hidden channels
@@ -330,14 +851,56 @@ class TFC3x(keras.layers.Layer):
         )

     def call(self, inputs):
-        """Processes input through cascaded convolutions and merges features, returning the final tensor output."""
+        """
+        Process input through cross-convolutions and merge features for enhanced detection.
+
+        Args:
+            inputs (tf.Tensor): Input tensor with shape (batch_size, height, width, channels).
+
+        Returns:
+            (tf.Tensor): Output tensor after processing through cross-convolutions and feature merging, with shape
+                (batch_size, new_height, new_width, new_channels).
+ + Example: + ```python + tfc3x_layer = TFC3x(c1=64, c2=128, n=3, shortcut=True, g=1, e=0.5) + input_tensor = tf.random.normal([1, 64, 64, 64]) + output_tensor = tfc3x_layer(input_tensor) + ``` + + Note: + This class is part of the TensorFlow, Keras, and TFLite versions of YOLOv5. See https://github.com/ultralytics/yolov5 + for more information. + """ return self.cv3(tf.concat((self.m(self.cv1(inputs)), self.cv2(inputs)), axis=3)) class TFSPP(keras.layers.Layer): # Spatial pyramid pooling layer used in YOLOv3-SPP def __init__(self, c1, c2, k=(5, 9, 13), w=None): - """Initializes a YOLOv3-SPP layer with specific input/output channels and kernel sizes for pooling.""" + """ + Initialize a spatial pyramid pooling (SPP) layer for YOLO models. + + Args: + c1 (int): Number of input channels. + c2 (int): Number of output channels. + k (tuple[int, int, int]): Kernel sizes for the spatial pooling layers. Default is (5, 9, 13). + w (object | None): Weights from a pretrained model. Default is None. + + Returns: + None + + Example: + ```python + yolo_spp = TFSPP(c1=256, c2=512, k=(5, 9, 13), w=pretrained_weights) + ``` + + Notes: + The SPP layer is designed to increase the receptive field by applying a series of max pooling operations with + large kernel sizes, improving the detection of objects at various scales in YOLO models. + """ super().__init__() c_ = c1 // 2 # hidden channels self.cv1 = TFConv(c1, c_, 1, 1, w=w.cv1) @@ -345,7 +908,28 @@ class TFSPP(keras.layers.Layer): self.m = [keras.layers.MaxPool2D(pool_size=x, strides=1, padding="SAME") for x in k] def call(self, inputs): - """Processes input through two TFConv layers and concatenates with max-pooled outputs at intermediate stage.""" + """ + Perform spatial pyramid pooling (SPP) on the input tensor to extract multi-scale features. + + Args: + inputs (tf.Tensor): Input tensor from the previous layer with shape (B, H, W, C), where B is the batch size, + H is the height, W is the width, and C is the number of input channels. + + Returns: + (tf.Tensor): Output tensor with multi-scale features, after applying SPP and concatenation. The shape of + the output tensor will be (B, H, W, c2), where c2 is the number of output channels. + + Example: + ```python + spp_layer = TFSPP(c1=256, c2=512, k=(5, 9, 13)) + output = spp_layer(inputs) + ``` + + Note: + The layer performs convolution and max pooling with different pool sizes before concatenating the results + for enhanced feature extraction. This is typically used in object detection models like YOLO for capturing + multi-scale context. + """ x = self.cv1(inputs) return self.cv2(tf.concat([x] + [m(x) for m in self.m], 3)) @@ -353,8 +937,26 @@ class TFSPP(keras.layers.Layer): class TFSPPF(keras.layers.Layer): # Spatial pyramid pooling-Fast layer def __init__(self, c1, c2, k=5, w=None): - """Initializes a fast spatial pyramid pooling layer with customizable in/out channels, kernel size, and - weights. + """ + Initialize a TFSPPF (Spatial Pyramid Pooling-Fast) layer with specified parameters. + + Args: + c1 (int): Number of input channels. + c2 (int): Number of output channels. + k (int): Kernel size for max pooling. Default is 5. + w (None | dict): Weights to initialize the layer. A dictionary containing the necessary weights for the layers. + + Returns: + None: This method does not return anything as it initializes the layer in place. 
+ + Example: + ```python + tf_sppf = TFSPPF(c1=256, c2=512, k=5, w=weights) + ``` + + Note: + This TFSPPF layer is specifically designed for YOLOv5 architecture, offering a faster variant of spatial pyramid + pooling by using fewer layers for efficiency while maintaining performance. """ super().__init__() c_ = c1 // 2 # hidden channels @@ -363,8 +965,24 @@ class TFSPPF(keras.layers.Layer): self.m = keras.layers.MaxPool2D(pool_size=k, strides=1, padding="SAME") def call(self, inputs): - """Executes the model's forward pass, concatenating input features with three max-pooled versions before final - convolution. + """ + Perform spatial pyramid pooling-Fast (SPPF) on input tensors, concatenating pooled features with the original tensor. + + Args: + inputs (tf.Tensor): Input tensor with shape (N, H, W, C) for batch size N, height H, width W, and channels C. + + Returns: + (tf.Tensor): Output tensor with shape (N, H, W, C_out), where C_out is the number of output channels. + + Example: + ```python + layer = TFSPPF(c1=256, c2=512, k=5) + output = layer(inputs) # inputs should be a tensor of shape (N, H, W, 256) + ``` + + Note: + This TFSPPF layer is specifically designed for YOLOv5 architecture, offering a faster variant of spatial pyramid + pooling by using fewer layers for efficiency while maintaining performance. """ x = self.cv1(inputs) y1 = self.m(x) @@ -375,8 +993,27 @@ class TFSPPF(keras.layers.Layer): class TFDetect(keras.layers.Layer): # TF YOLOv5 Detect layer def __init__(self, nc=80, anchors=(), ch=(), imgsz=(640, 640), w=None): - """Initializes YOLOv5 detection layer for TensorFlow with configurable classes, anchors, channels, and image - size. + """ + Initializes YOLOv5 detection layer for TensorFlow. + + Args: + nc (int, optional): Number of classes. Defaults to 80. + anchors (tuple, optional): Tuple of anchor box dimensions. Defaults to (). + ch (tuple, optional): Number of input channels for each detection layer. Defaults to (). + imgsz (tuple[int, int], optional): Input image size as (height, width). Defaults to (640, 640). + w (object, optional): Weights object containing pretrained weight tensors and other parameters. + + Returns: + None + + Note: + This detection layer forms part of the YOLOv5 architecture for object detection tasks in TensorFlow, handling the + prediction of bounding boxes and class probabilities for detected objects. + + Example: + ```python + detection_layer = TFDetect(nc=80, anchors=((10, 13, 16, 30, 33, 23),), ch=(256, 512, 1024), imgsz=(640, 640)) + ``` """ super().__init__() self.stride = tf.convert_to_tensor(w.stride.numpy(), dtype=tf.float32) @@ -395,7 +1032,28 @@ class TFDetect(keras.layers.Layer): self.grid[i] = self._make_grid(nx, ny) def call(self, inputs): - """Performs forward pass through the model layers to predict object bounding boxes and classifications.""" + """ + Perform object detection computations using inputs from multiple feature layers, applying activation, convolution, and + grid-based adjustments to generate final output. + + Args: + inputs (list[tf.Tensor]): List of input tensors from multiple feature layers, each with shape (B, H, W, C), + where B is batch size, H is height, W is width, and C is the number of channels. + + Returns: + (tf.Tensor): Final processed tensor with object detection information, shape (B, N, 85), where N is the number + of predictions and 85 represents the output features (4 box coords + 1 objectness score + 80 class scores). 
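+
+            The width of this last dimension is in general no = nc + 5; the value 85 above corresponds to the default
+            nc = 80 classes, e.g.:
+            ```python
+            nc = 80      # number of classes (default assumed above)
+            no = nc + 5  # 4 box coordinates + 1 objectness score + nc class scores -> 85
+            ```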
+ + Example: + ```python + detection_layer = TFDetect(nc=80, anchors=((10, 13, 16, 30, 33, 23),), ch=(256, 512, 1024), imgsz=(640, 640)) + output = detection_layer([feature_map1, feature_map2, feature_map3]) + ``` + + Note: + Ensure input tensors have consistent shapes aligned with the detection layer configuration. The function + reshapes, normalizes, and concatenates features from each input tensor to produce final detection outputs. + """ z = [] # inference output x = [] for i in range(self.nl): @@ -420,7 +1078,21 @@ class TFDetect(keras.layers.Layer): @staticmethod def _make_grid(nx=20, ny=20): - """Generates a 2D grid of coordinates in (x, y) format with shape [1, 1, ny*nx, 2].""" + """ + Generate a 2D coordinate grid for anchors with shape (1, 1, ny*nx, 2). + + Args: + nx (int): Number of grid anchors along the x-axis. Default is 20. + ny (int): Number of grid anchors along the y-axis. Default is 20. + + Returns: + (tf.Tensor): A tensor containing the 2D coordinate grid with shape (1, 1, ny*nx, 2). + + Example: + ```python + grid = TFDetect._make_grid(20, 20) + ``` + """ # return torch.stack((xv, yv), 2).view((1, 1, ny, nx, 2)).float() xv, yv = tf.meshgrid(tf.range(nx), tf.range(ny)) return tf.cast(tf.reshape(tf.stack([xv, yv], 2), [1, 1, ny * nx, 2]), dtype=tf.float32) @@ -429,8 +1101,32 @@ class TFDetect(keras.layers.Layer): class TFSegment(TFDetect): # YOLOv5 Segment head for segmentation models def __init__(self, nc=80, anchors=(), nm=32, npr=256, ch=(), imgsz=(640, 640), w=None): - """Initializes YOLOv5 Segment head with specified channel depths, anchors, and input size for segmentation - models. + """ + Initialize the YOLOv5 segmentation head for TensorFlow models. + + Args: + nc (int): Number of classes for segmentation. + anchors (list[float]): List of anchor boxes used in YOLOv5, this should be an iterable containing anchor sizes. + nm (int): Number of segmentation masks. + npr (int): Number of prototypes. + ch (list[int]): List of input channels for each detection layer. + imgsz (tuple[int, int]): Image size in the format (height, width). + w (object): Pretrained weights for initializing the model. + + Returns: + None + + Example: + ```python + import tensorflow as tf + from your_module import TFSegment + + segmentor = TFSegment(nc=80, anchors=[[10, 13, 16, 30, 33, 23], [30, 61, 62, 45, 59, 119], [116, 90, 156, 198, 373, 326]], + nm=32, npr=256, ch=[256, 512, 1024], imgsz=(640, 640), w=weights) + ``` + + Note: + The 'w' parameter is critical for performance as it utilizes pretrained weights to enhance segmentation accuracy. """ super().__init__(nc, anchors, ch, imgsz, w) self.nm = nm # number of masks @@ -441,7 +1137,30 @@ class TFSegment(TFDetect): self.detect = TFDetect.call def call(self, x): - """Applies detection and proto layers on input, returning detections and optionally protos if training.""" + """ + Perform segmentation using the YOLOv5 segmentation head for TensorFlow models. + + Args: + x (list[tf.Tensor]): Input feature maps from backbone network. + + Returns: + (tuple[tf.Tensor]): A tuple containing: + - detections (tf.Tensor): Detection predictions with shape (N, num_detections, 5 + num_classes + num_masks), + where N is the batch size. + - proto (tf.Tensor): Prototype masks with shape (N, num_prototypes, height, width). 
+ + Example: + ```python + # Assuming 'backbone_features' is a list of TensorFlow tensors from the backbone network + segmentor = TFSegment(nc=80, anchors=[[10, 13, 16, 30, 33, 23], [30, 61, 62, 45, 59, 119], [116, 90, 156, 198, 373, 326]], + nm=32, npr=256, ch=[256, 512, 1024], imgsz=(640, 640), w=weights) + detections, proto = segmentor(backbone_features) + ``` + + Note: + The method processes the input feature maps to produce object detection predictions and prototype masks used in + segmentation tasks. + """ p = self.proto(x[0]) # p = TFUpsample(None, scale_factor=4, mode='nearest')(self.proto(x[0])) # (optional) full-size protos p = tf.transpose(p, [0, 3, 1, 2]) # from shape(1,160,160,32) to shape(1,32,160,160) @@ -451,8 +1170,26 @@ class TFSegment(TFDetect): class TFProto(keras.layers.Layer): def __init__(self, c1, c_=256, c2=32, w=None): - """Initializes TFProto layer with convolutional and upsampling layers for feature extraction and - transformation. + """ + Initializes TFProto layer with convolutional and upsampling layers for feature extraction and transformation. + + Args: + c1 (int): Number of input channels. + c_ (int): Number of hidden channels, default is 256. + c2 (int): Number of output channels, default is 32. + w (object | None): Pretrained weights for initializing the convolutional layers. If None, layers are initialized + with default settings. + + Returns: + (TFProto): Instance of the TFProto layer, ready for use in a TensorFlow model. + + Example: + ```python + tf_proto_layer = TFProto(c1=128) + ``` + + Note: + This layer is designed to be a part of the YOLOv5 model pipeline, specifically for segmenting image features. """ super().__init__() self.cv1 = TFConv(c1, c_, k=3, w=w.cv1) @@ -461,7 +1198,24 @@ class TFProto(keras.layers.Layer): self.cv3 = TFConv(c_, c2, w=w.cv3) def call(self, inputs): - """Performs forward pass through the model, applying convolutions and upscaling on input tensor.""" + """ + Handles forwarding through convolutional and upsampling layers to generate mask prototypes in TF models. + + Args: + inputs (tf.Tensor): A tensor with shape (N, H, W, C), where N is the batch size, H is the height, + W is the width, and C is the number of channels. + + Returns: + (tf.Tensor): A tensor with the transformed features, having shape (N, H_new, W_new, C2) where H_new and W_new + are the new height and width after processing, and C2 is the number of output channels. + + Example: + ```python + tf_proto_layer = TFProto(c1=128) + input_tensor = tf.random.normal([1, 64, 64, 128]) + output_tensor = tf_proto_layer(input_tensor) + ``` + """ return self.cv3(self.cv2(self.upsample(self.cv1(inputs)))) @@ -469,10 +1223,25 @@ class TFUpsample(keras.layers.Layer): # TF version of torch.nn.Upsample() def __init__(self, size, scale_factor, mode, w=None): """ - Initializes a TensorFlow upsampling layer with specified size, scale_factor, and mode, ensuring scale_factor is - even. - - Warning: all arguments needed including 'w' + Initialize a TensorFlow upsampling layer. + + Args: + size (tuple[int] | None): Desired output size. Default is `None`. + scale_factor (int | None): Multiplier for the height and width of the input. Must be even. Default is `None`. + mode (str): Upsampling algorithm to use. Options are ('nearest', 'bilinear', etc.). + w (torch.nn.Module | None): Placeholder for compatibility. Default is `None`. 
+ + Returns: + None + + Example: + ```python + upsample_layer = TFUpsample(size=None, scale_factor=2, mode="nearest") + result = upsample_layer(input_tensor) + ``` + + Note: + Ensure that 'scale_factor' is a multiple of 2. """ super().__init__() assert scale_factor % 2 == 0, "scale_factor must be multiple of 2" @@ -483,25 +1252,130 @@ class TFUpsample(keras.layers.Layer): # size=(x.shape[1] * 2, x.shape[2] * 2)) def call(self, inputs): - """Applies upsample operation to inputs using nearest neighbor interpolation.""" + """ + Perform nearest neighbor upsampling on input tensors using the specified scale factor and mode in TensorFlow. + + Args: + inputs (tf.Tensor): Input tensor to be upsampled, typically with shape (B, H, W, C) where B is the batch size, + H is the height, W is the width, and C is the number of channels. + + Returns: + (tf.Tensor): Upsampled tensor with dimensions equal to original dimensions multiplied by the scale factor. + The output tensor will have a shape of (B, H * scale_factor, W * scale_factor, C). + + Example: + ```python + import tensorflow as tf + from your_module import TFUpsample + + upsample_layer = TFUpsample(size=None, scale_factor=2, mode="nearest") + input_tensor = tf.random.normal([1, 64, 64, 32]) + output_tensor = upsample_layer(input_tensor) + print(output_tensor.shape) # Expected output shape: (1, 128, 128, 32) + ``` + + Note: + Ensure that 'scale_factor' is a multiple of 2. This layer resizes the spatial dimensions (height and width) + of the input tensor by the specified scale factor. + """ return self.upsample(inputs) class TFConcat(keras.layers.Layer): # TF version of torch.concat() def __init__(self, dimension=1, w=None): - """Initializes a TensorFlow layer for NCHW to NHWC concatenation, requiring dimension=1.""" + """ + Initializes a TensorFlow layer for concatenating tensors along the specified dimension. + + Args: + dimension (int, optional): The dimension along which to concatenate tensors. Default is 1, converting + from NCHW to NHWC format. + w (torch.nn.Module | None, optional): Pretrained weights from a PyTorch model to match the dimension + order. Default is None. + + Returns: + None + + Example: + ```python + import tensorflow as tf + from models.tf import TFConcat + + # Initialize the concatenate layer + concat_layer = TFConcat(dimension=1) + + # Concatenate two sample tensors along the specified dimension + tensor1 = tf.random.normal([1, 256, 256, 64]) + tensor2 = tf.random.normal([1, 256, 256, 64]) + output_tensor = concat_layer([tensor1, tensor2]) + ``` + + Note: + This class ensures compatibility between PyTorch and TensorFlow tensor formats by handling only the + concatenation of NCHW (PyTorch) to NHWC (TensorFlow). + """ super().__init__() assert dimension == 1, "convert only NCHW to NHWC concat" self.d = 3 def call(self, inputs): - """Concatenates a list of tensors along the last dimension, used for NCHW to NHWC conversion.""" + """ + Concatenates input tensors along the last dimension, converting from NCHW to NHWC format. + + Args: + inputs (list[tf.Tensor]): List of input tensors in NCHW format to be concatenated. + + Returns: + (tf.Tensor): Concatenated tensor in NHWC format. 
+ + Example: + ```python + concat_layer = TFConcat() + input1 = tf.random.normal([1, 64, 32, 32]) + input2 = tf.random.normal([1, 64, 32, 32]) + output = concat_layer([input1, input2]) + ``` + """ return tf.concat(inputs, self.d) def parse_model(d, ch, model, imgsz): - """Parses a model definition dict `d` to create YOLOv5 model layers, including dynamic channel adjustments.""" + """ + Parses the model configuration dictionary to create YOLOv5 model layers with dynamic channel adjustments. + + This function processes the model configuration, initializing layers and setting up the neural network architecture + for YOLOv5 model training and inference in TensorFlow. + + Args: + d (dict): Model configuration dictionary containing anchor boxes, class counts, depth multiple, width multiple, + and backbone/head layer details. + ch (list[int]): List of input channels for each layer. + model (object): Existing model object containing pretrained weights and other parameters. + imgsz (tuple[int, int]): Input image size as (height, width). + + Returns: + (list[keras.Sequential]): Parsed list of keras.Sequential model layers set up for YOLOv5, with adjusted channels. + + Example: + ```python + model_config = { + 'anchors': [[116, 90, 156, 198, 373, 326], [30, 61, 62, 45, 59, 119], [10, 13, 16, 30, 33, 23]], + 'nc': 80, 'depth_multiple': 0.33, 'width_multiple': 0.50, + 'backbone': [ + [-1, 1, 'Conv', [32, 3, 1]], + [-1, 1, 'C3', [64, 3, 0.5]], + ], + 'head': [ + [-1, 1, 'SPPF', [256, 5]], + ], + } + input_channels = [3] + parsed_model_layers = parse_model(model_config, input_channels, pretrained_model, (640, 640)) + ``` + + Note: + This function converts PyTorch model layers to TensorFlow Keras layers, maintaining parameter consistency. + """ LOGGER.info(f"\n{'':>3}{'from':>18}{'n':>3}{'params':>10} {'module':<40}{'arguments':<30}") anchors, nc, gd, gw, ch_mul = ( d["anchors"], @@ -583,8 +1457,28 @@ def parse_model(d, ch, model, imgsz): class TFModel: # TF YOLOv5 model def __init__(self, cfg="yolov5s.yaml", ch=3, nc=None, model=None, imgsz=(640, 640)): - """Initializes TF YOLOv5 model with specified configuration, channels, classes, model instance, and input - size. + """ + Initialize a TensorFlow YOLOv5 model with specified configuration, input channels, and classes. + + Args: + cfg (str | dict): Model configuration, either a file path to a yaml file or a dictionary containing network + structure and parameters. + ch (int): Number of input channels. + nc (int | None): Number of classes for detection tasks. + model (torch.nn.Module | None): PyTorch model instance to map to TensorFlow model structure. + imgsz (tuple[int, int]): Input image size as a tuple of (height, width). + + Returns: + None + + Example: + ```python + tf_model = TFModel(cfg='yolov5s.yaml', ch=3, nc=80, imgsz=(640, 640)) + ``` + + Note: + This model supports YOLOv5 architectures and is compatible with TensorFlow and Keras frameworks. + Ensure to provide properly formatted configuration files or dictionaries for successful model initialization. """ super().__init__() if isinstance(cfg, dict): @@ -612,7 +1506,38 @@ class TFModel: iou_thres=0.45, conf_thres=0.25, ): - """Runs inference on input data, with an option for TensorFlow NMS.""" + """ + Perform prediction on input data using the TensorFlow YOLOv5 model, optionally applying non-max suppression. + + Args: + inputs (tf.Tensor): Input tensor containing the image data, shape (B, H, W, C). + tf_nms (bool): Apply TensorFlow non-max suppression after prediction. Default is False. 
+            agnostic_nms (bool): Class-agnostic non-max suppression. Default is False.
+            topk_per_class (int): Top-K maximum detections per class. Default is 100.
+            topk_all (int): Top-K maximum total detections. Default is 100.
+            iou_thres (float): Intersection-over-union (IoU) threshold for NMS. Default is 0.45.
+            conf_thres (float): Confidence score threshold for filtering predictions. Default is 0.25.
+
+        Returns:
+            (tuple[tf.Tensor]): If `tf_nms` is True, returns the post-NMS results as a tuple of
+                (boxes, scores, classes, valid_detections), with boxes in (x1, y1, x2, y2) format. If `tf_nms` is False,
+                returns the raw output tensors from the model without non-max suppression.
+
+        Example:
+            ```python
+            import tensorflow as tf
+            from models.tf import TFModel
+
+            # Initialize the model
+            model = TFModel(cfg='yolov5s.yaml', ch=3, nc=80)
+
+            # Prepare input tensor
+            img = tf.random.normal([1, 640, 640, 3])
+
+            # Perform prediction
+            predictions = model.predict(img, tf_nms=True)
+            ```
+        """
         y = []  # outputs
         x = inputs
         for m in self.model.layers:
@@ -645,8 +1570,25 @@ class TFModel:
 
     @staticmethod
     def _xywh2xyxy(xywh):
-        """Converts bounding box format from [x, y, w, h] to [x1, y1, x2, y2], where xy1=top-left and xy2=bottom-
-        right.
+        """
+        Convert bounding box format from [x, y, w, h] to [x1, y1, x2, y2].
+
+        Args:
+            xywh (tf.Tensor): Bounding boxes in the format (x, y, w, h) with shape (N, 4) where N is the number of boxes.
+
+        Returns:
+            (tf.Tensor): Bounding boxes in the format (x1, y1, x2, y2) with shape (N, 4), where x1, y1 are the top-left
+            coordinates and x2, y2 are the bottom-right coordinates.
+
+        Notes:
+            This method is useful for converting bounding box formats for various operations like plotting, non-maximum
+            suppression (NMS), or further model predictions.
+
+        Examples:
+            ```python
+            boxes_xywh = tf.constant([[50.0, 50.0, 100.0, 100.0], [30.0, 40.0, 120.0, 80.0]])
+            boxes_xyxy = TFModel._xywh2xyxy(boxes_xywh)
+            ```
         """
         x, y, w, h = tf.split(xywh, num_or_size_splits=4, axis=-1)
         return tf.concat([x - w / 2, y - h / 2, x + w / 2, y + h / 2], axis=-1)
@@ -655,7 +1597,40 @@ class TFModel:
 
 class AgnosticNMS(keras.layers.Layer):
     # TF Agnostic NMS
     def call(self, input, topk_all, iou_thres, conf_thres):
-        """Performs agnostic NMS on input tensors using given thresholds and top-K selection."""
+        """
+        Perform class-agnostic non-maximum suppression (NMS) on input bounding boxes.
+
+        Args:
+            input (tuple[tf.Tensor, tf.Tensor, tf.Tensor]): Tuple containing:
+                boxes (tf.Tensor): Bounding boxes with shape (B, N, 4), where N is the number of boxes per image.
+                classes (tf.Tensor): Per-class probabilities with shape (B, N, C), where C is the number of classes.
+                scores (tf.Tensor): Confidence scores with shape (B, N, C).
+            topk_all (int): Maximum number of final boxes to keep after non-max suppression.
+            iou_thres (float): Intersection over union (IoU) threshold for NMS.
+            conf_thres (float): Confidence threshold to filter boxes before NMS.
+
+        Returns:
+            (tuple[tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor]): Four tensors containing:
+                boxes (tf.Tensor): Padded bounding boxes after NMS, with shape (B, topk_all, 4).
+                scores (tf.Tensor): Scores of the kept boxes, with shape (B, topk_all).
+                classes (tf.Tensor): Class indices of the kept boxes, with shape (B, topk_all).
+                valid_detections (tf.Tensor): Number of valid detections per image, with shape (B,).
+
+        Example:
+            ```python
+            boxes = tf.random.uniform((1, 100, 4), minval=0, maxval=640)
+            classes = tf.random.uniform((1, 100, 80), minval=0, maxval=1)
+            scores = tf.random.uniform((1, 100, 80), minval=0, maxval=1)
+            nms_layer = AgnosticNMS()
+            final_boxes, final_scores, final_classes, valid_detections = nms_layer(
+                (boxes, classes, scores), topk_all=20, iou_thres=0.5, conf_thres=0.25)
+            ```
+
+        Note:
+            The layer operates on single-anchor format inputs, performing class-agnostic NMS to reduce redundancy
+            in detected bounding boxes.
+        """
         return tf.map_fn(
             lambda x: self._nms(x, topk_all, iou_thres, conf_thres),
             input,
@@ -665,8 +1640,39 @@ class AgnosticNMS(keras.layers.Layer):
 
     @staticmethod
     def _nms(x, topk_all=100, iou_thres=0.45, conf_thres=0.25):
-        """Performs agnostic non-maximum suppression (NMS) on detected objects, filtering based on IoU and confidence
-        thresholds.
+        """
+        Perform class-agnostic non-maximum suppression on the predictions for a single image.
+
+        Args:
+            x (tuple[tf.Tensor, tf.Tensor, tf.Tensor]): A tuple containing bounding boxes (N, 4), class probabilities (N, C)
+                where C is the number of classes, and scores (N, C) where N is the number of predictions.
+            topk_all (int): The maximum number of detections to keep.
+            iou_thres (float): Intersection over Union (IoU) threshold for NMS.
+            conf_thres (float): Confidence threshold for filtering low-confidence predictions.
+
+        Returns:
+            tuple: A tuple containing:
+                - tf.Tensor: Padded bounding boxes with shape (topk_all, 4).
+                - tf.Tensor: Padded scores with shape (topk_all,).
+                - tf.Tensor: Padded class indices with shape (topk_all,).
+                - tf.Tensor: Number of valid detections as a scalar int32 tensor.
+
+        Example:
+            ```python
+            boxes = tf.random.uniform((100, 4), minval=0, maxval=640)
+            classes = tf.random.uniform((100, 5), minval=0, maxval=1)
+            scores = tf.random.uniform((100, 5), minval=0, maxval=1)
+            topk_all = 50
+            iou_thres = 0.5
+            conf_thres = 0.3
+
+            padded_boxes, padded_scores, padded_classes, valid_detections = AgnosticNMS._nms(
+                (boxes, classes, scores), topk_all, iou_thres, conf_thres)
+            ```
+
+        Note:
+            This function treats detections as class-agnostic, suppressing overlapping boxes without regard to class
+            labels during NMS.
         """
         boxes, classes, scores = x
         class_inds = tf.cast(tf.argmax(classes, axis=-1), tf.float32)
@@ -700,7 +1706,25 @@ class AgnosticNMS(keras.layers.Layer):
 
 
 def activations(act=nn.SiLU):
-    """Converts PyTorch activations to TensorFlow equivalents, supporting LeakyReLU, Hardswish, and SiLU/Swish."""
+    """
+    Convert PyTorch activation functions to their TensorFlow equivalents.
+
+    Args:
+        act (torch.nn.Module, optional): Activation function instance from PyTorch. Default is nn.SiLU.
+
+    Returns:
+        (callable): A TensorFlow-compatible activation function.
+
+    Example:
+        ```python
+        tf_activation = activations(nn.LeakyReLU(0.1))
+        output = tf_activation(input_tensor)
+        ```
+
+    Note:
+        Supports the conversion of LeakyReLU, Hardswish, and SiLU (Swish) activation functions. For unsupported types,
+        raises an error.
+    """
     if isinstance(act, nn.LeakyReLU):
         return lambda x: keras.activations.relu(x, alpha=0.1)
     elif isinstance(act, nn.Hardswish):
@@ -712,8 +1736,29 @@ def activations(act=nn.SiLU):
 
 
 def representative_dataset_gen(dataset, ncalib=100):
-    """Generates a representative dataset for calibration by yielding transformed numpy arrays from the input
-    dataset.
+    """
+    Generates a representative dataset for calibration by yielding transformed numpy arrays from the input dataset.
+
+    Args:
+        dataset (iterable): Dataset to yield images for calibration.
Each item in the dataset should be a tuple containing + (path, img, im0s, vid_cap, string), where 'img' is the image represented as a numpy array with shape (C, H, W). + ncalib (int): Number of samples to yield for calibration (default is 100). + + Returns: + (generator): A generator yielding a list of numpy arrays, each representing an image with shape (1, H, W, C) scaled and + preprocessed for model calibration. + + Example: + ```python + dataset = DataLoader(...) # define your dataset + data_gen = representative_dataset_gen(dataset, ncalib=50) + for calibration_data in data_gen: + # perform calibration + ``` + + Notes: + - The function stops yielding data once ncalib samples have been produced from the dataset. + - Images are converted from shape (C, H, W) to (1, H, W, C) and scaled to a range of [0, 1]. """ for n, (path, img, im0s, vid_cap, string) in enumerate(dataset): im = np.transpose(img, [1, 2, 0]) @@ -731,7 +1776,27 @@ def run( dynamic=False, # dynamic batch size ): # PyTorch model - """Exports YOLOv5 model from PyTorch to TensorFlow and Keras formats, performing inference for validation.""" + """ + Exports YOLOv5 model from PyTorch to TensorFlow and Keras formats, performing inference for validation. + + Args: + weights (str | pathlib.Path): Path to the weights file. Default is ROOT / "yolov5s.pt". + imgsz (tuple[int, int]): Tuple of integers representing the height and width of the image for inference. Default is (640, 640). + batch_size (int): Size of the batch for inference. Default is 1. + dynamic (bool): Flag to indicate if dynamic batch size should be used in Keras model. Default is False. + + Returns: + None: The function exports the model and performs inference without returning any value. + + Example: + ```python + run(weights='best.pt', imgsz=(640, 640), batch_size=1, dynamic=False) + ``` + + Note: + - Ensure you have the necessary dependencies installed (`torch`, `tensorflow`, `keras`). + - Adjust the `weights` path, `imgsz`, `batch_size`, and `dynamic` flag as needed for your setup. + """ im = torch.zeros((batch_size, 3, *imgsz)) # BCHW image model = attempt_load(weights, device=torch.device("cpu"), inplace=True, fuse=False) _ = model(im) # inference @@ -751,8 +1816,34 @@ def run( def parse_opt(): - """Parses and returns command-line options for model inference, including weights path, image size, batch size, and - dynamic batching. + """ + Parse command-line arguments for model inference configuration. + + This utility function parses command-line arguments to configure the inference properties such as paths to weight files, + image sizes, batch sizes, and dynamic batch size options. + + Args: + None + + Returns: + (argparse.Namespace): Namespace object containing parsed command-line options: + - weights (str): Path to the model weights. + - imgsz (list[int]): Inference image size (height, width). + - batch_size (int): Batch size for inference. + - dynamic (bool): Whether to use dynamic batch size. + + Example: + ```python + opt = parse_opt() + print(opt.weights) + print(opt.imgsz) + print(opt.batch_size) + print(opt.dynamic) + ``` + + Note: + The --imgsz argument accepts either a single integer or a tuple of two integers. If only one value is provided, + it will be duplicated to form a square shape (height, width). 
""" parser = argparse.ArgumentParser() parser.add_argument("--weights", type=str, default=ROOT / "yolov5s.pt", help="weights path") @@ -766,7 +1857,28 @@ def parse_opt(): def main(opt): - """Executes the YOLOv5 model run function with parsed command line options.""" + """ + Execute the main function to run model export and validation processes for YOLOv5, including conversion to TensorFlow + and Keras formats. + + Args: + opt (argparse.Namespace): Parsed command-line arguments which include: + - weights (str): Path to the model weights. + - imgsz (list[int]): Inference image size (height, width). + - batch_size (int): Batch size for inference. + - dynamic (bool): Whether to use dynamic batch size. + + Example: + ```python + if __name__ == "__main__": + opt = parse_opt() + main(opt) + ``` + + Note: + This function integrates and validates the conversion of YOLOv5 from PyTorch to TensorFlow and Keras frameworks. + For additional export options, refer to the export.py script. + """ run(**vars(opt)) diff --git a/models/yolo.py b/models/yolo.py index d89c5da01..1d8da8273 100644 --- a/models/yolo.py +++ b/models/yolo.py @@ -76,7 +76,27 @@ class Detect(nn.Module): export = False # export mode def __init__(self, nc=80, anchors=(), ch=(), inplace=True): - """Initializes YOLOv5 detection layer with specified classes, anchors, channels, and inplace operations.""" + """ + Initializes the YOLOv5 Detect layer with class count, anchors, channels, and inplace operations. + + Args: + nc (int, optional): Number of classes. Default is 80. + anchors (tuple, optional): Anchor box dimensions, typically specified for each detection layer. Default is (). + ch (tuple, optional): Number of input channels for each detection layer. Default is (). + inplace (bool, optional): If True, operations are done inplace. Default is True. + + Returns: + None + + Example: + ```python + detect_layer = Detect(nc=80, anchors=(), ch=(256, 512, 1024), inplace=True) + ``` + + Note: + This function initializes detection heads in the YOLOv5 model, setting up convolution layers, grids, and + anchor grids required for object detection inference. + """ super().__init__() self.nc = nc # number of classes self.no = nc + 5 # number of outputs per anchor @@ -89,7 +109,23 @@ class Detect(nn.Module): self.inplace = inplace # use inplace ops (e.g. slice assignment) def forward(self, x): - """Processes input through YOLOv5 layers, altering shape for detection: `x(bs, 3, ny, nx, 85)`.""" + """ + Processes input through detection layers, reshaping and applying convolution for YOLOv5 inference. + + Args: + x (list[torch.Tensor]): List of feature maps from backbone with shape (B, C, H, W) where B is the batch + size, C is the number of channels, and H and W are height and width. + + Returns: + (list[torch.Tensor]): List of processed detections, each a torch Tensor with shape (B, N, D) where B + is the batch size, N is the number of detections, and D is the dimensions of each detection + (e.g., bounding box coordinates, objectness score, class probabilities). + + Note: + This method applies a series of convolutions to transform the input feature maps into detection + outputs. It also handles reshaping and permutation to align with YOLOv5's output format. During + inference, additional steps are performed to compute final object locations and dimensions. 
+ """ z = [] # inference output for i in range(self.nl): x[i] = self.m[i](x[i]) # conv @@ -115,7 +151,29 @@ class Detect(nn.Module): return x if self.training else (torch.cat(z, 1),) if self.export else (torch.cat(z, 1), x) def _make_grid(self, nx=20, ny=20, i=0, torch_1_10=check_version(torch.__version__, "1.10.0")): - """Generates a mesh grid for anchor boxes with optional compatibility for torch versions < 1.10.""" + """ + Generate a mesh grid for anchor boxes with torch version compatibility for detection models. + + Args: + nx (int): Number of grid cells along the x-axis. + ny (int): Number of grid cells along the y-axis. + i (int): Index of the detection layer for which the grid is being generated. + torch_1_10 (bool): Indicator whether the torch version is at least 1.10.0 for meshgrid compatibility. + + Returns: + (tuple[torch.Tensor, torch.Tensor]): A tuple containing two tensors: + - grid (torch.Tensor): The generated grid with shape (1, num_anchors, ny, nx, 2), containing xy coordinates. + - anchor_grid (torch.Tensor): The anchor grid scaled by the stride, with shape (1, num_anchors, ny, nx, 2). + + Example: + ```python + detector = Detect() + grid, anchor_grid = detector._make_grid(20, 20, 0) + ``` + + Note: + The function ensures compatibility with different torch versions by using appropriate meshgrid indexing options. + """ d = self.anchors[i].device t = self.anchors[i].dtype shape = 1, self.na, ny, nx, 2 # grid shape @@ -129,7 +187,25 @@ class Detect(nn.Module): class Segment(Detect): # YOLOv5 Segment head for segmentation models def __init__(self, nc=80, anchors=(), nm=32, npr=256, ch=(), inplace=True): - """Initializes YOLOv5 Segment head with options for mask count, protos, and channel adjustments.""" + """ + Initializes YOLOv5 Segment head with parameters for masks, prototypes, class count, anchors, and channels. + + Args: + nc (int): Number of classes for the segmentation model (default is 80). + anchors (tuple): Tuple of anchor box dimensions for the segmentation model. + nm (int): Number of masks for the segmentation (default is 32). + npr (int): Number of prototypes for the masks (default is 256). + ch (tuple): Tuple of input channels for each detection layer. + inplace (bool): If True, use in-place operations for layer computations (default is True). + + Returns: + None + + Example: + ```python + segment_head = Segment(nc=80, anchors=anchors, nm=32, npr=256, ch=[512, 256, 128], inplace=True) + ``` + """ super().__init__(nc, anchors, ch, inplace) self.nm = nm # number of masks self.npr = npr # number of protos @@ -139,8 +215,38 @@ class Segment(Detect): self.detect = Detect.forward def forward(self, x): - """Processes input through the network, returning detections and prototypes; adjusts output based on - training/export mode. + """ + Processes input through the network, returning detections and prototypes. + + Args: + x (list[torch.Tensor]): List of input tensors corresponding to different detection layers, each with shape + (B, C, H, W), where B is batch size, C is number of channels, H and W are height and width. + + Returns: + (tuple[torch.Tensor, torch.Tensor]): A tuple containing: + - `detection` (torch.Tensor): The detection output tensor with shape (B, N, 85), where B is batch size, N is + the number of detections. + - `prototypes` (torch.Tensor): The prototype masks tensor produced by the network with shape (B, P, H', W'), + where B is batch size, P is the number of prototypes, and H' and W' correspond to height and width. 
+
+        Example:
+            ```python
+            import torch
+            from models.yolo import SegmentationModel
+
+            # Initialize a segmentation model; its detection head is a Segment module
+            model = SegmentationModel(cfg="yolov5s-seg.yaml")
+
+            # Generate dummy input
+            x = torch.randn(1, 3, 640, 640)
+
+            # Forward pass invokes Segment.forward on the head's feature maps
+            detections, prototypes = model(x)
+            ```
+
+        Note:
+            During inference (evaluation mode), detection outputs are post-processed to generate final bounding boxes and classes.
+            In training mode, the outputs are not processed.
         """
         p = self.proto(x[0])
         x = self.detect(self, x)
@@ -151,13 +257,64 @@ class BaseModel(nn.Module):
     """YOLOv5 base model."""
 
     def forward(self, x, profile=False, visualize=False):
-        """Executes a single-scale inference or training pass on the YOLOv5 base model, with options for profiling and
-        visualization.
+        """
+        Perform a forward pass through the YOLOv5 model, optionally profiling and visualizing features.
+
+        Args:
+            x (torch.Tensor): Input data tensor with shape (N, C, H, W).
+            profile (bool): Whether to profile execution time of each layer. Defaults to False.
+            visualize (bool): Whether to store and visualize feature maps. Defaults to False.
+
+        Returns:
+            (torch.Tensor | tuple): In training mode, returns a list of per-layer predictions, each with shape (N, 3, H, W, no).
+                In inference mode, returns a tuple whose first element is a tensor of shape (N, M, no), where M is the
+                total number of candidate predictions made before non-maximum suppression (NMS).
+
+        Example:
+            ```python
+            model = DetectionModel(cfg="yolov5s.yaml")  # any BaseModel subclass
+            input_tensor = torch.randn(1, 3, 640, 640)
+            output = model.forward(input_tensor, profile=True)
+            ```
+
+        Note:
+            - In training mode, the method returns unprocessed predictions for each scale, suitable for loss calculation.
+            - In inference mode, non-maximum suppression (NMS) is applied separately, after this forward pass, to refine predictions.
         """
         return self._forward_once(x, profile, visualize)  # single-scale inference, train
 
     def _forward_once(self, x, profile=False, visualize=False):
-        """Performs a forward pass on the YOLOv5 model, enabling profiling and feature visualization options."""
+        """
+        Execute a forward pass through the YOLOv5 model layers with optional profiling and visualization.
+
+        Args:
+            x (torch.Tensor): Input tensor with shape (N, C, H, W), where N is the batch size, C is the number
+                of channels, and H and W are the height and width of the input image, respectively.
+            profile (bool): If True, profiles the execution time for each layer. Defaults to False.
+            visualize (bool): If True, stores and visualizes feature maps. Defaults to False.
+
+        Returns:
+            (torch.Tensor): Model output whose shape depends on whether the model is in training or
+                inference mode.
+                - In training mode: Returns a list of tensors for each detection layer, each tensor has shape
+                  (N, 3, H, W, no), where `no` is the number of outputs per anchor.
+                - In inference mode: If not exporting, returns a tuple of the concatenated predictions with shape
+                  (N, M, no) and the raw per-layer outputs, where M is the number of candidate predictions.
+                - If exporting: Returns a tensor of shape (N, M, no).
+
+        Example:
+            ```python
+            model = DetectionModel(cfg="yolov5s.yaml")  # any BaseModel subclass
+            input_tensor = torch.randn(1, 3, 640, 640)  # Generate a random input tensor
+            output = model._forward_once(input_tensor, profile=True)
+            ```
+
+        Note:
+            This method conducts a single-scale inference or training pass through the model. Depending on the mode
+            (training or inference), the method behaves differently. In training mode, it returns unprocessed
+            predictions for each detection layer. In inference mode, non-maximum suppression (NMS) is typically
+            applied after this method to refine predictions.
+ """ y, dt = [], [] # outputs for m in self.model: if m.f != -1: # if not from previous layer @@ -171,7 +328,32 @@ class BaseModel(nn.Module): return x def _profile_one_layer(self, m, x, dt): - """Profiles a single layer's performance by computing GFLOPs, execution time, and parameters.""" + """ + Profiles a single model layer's GFLOPs, parameters, and execution time within the YOLOv5 model. + + Args: + m (nn.Module): The model layer to be profiled. + x (torch.Tensor): Input tensor passed to the model layer, with shape (N, C, H, W). + dt (list[float]): List to record execution times of the profiled layer. + + Returns: + None: The function updates the `dt` list with the execution time of the layer in milliseconds. + + Example: + ```python + model = BaseModel() + layer = nn.Conv2d(3, 16, 3, 1) # Example layer + input_tensor = torch.randn(1, 3, 640, 640) # Example input + execution_times = [] + + model._profile_one_layer(layer, input_tensor, execution_times) + ``` + + Note: + - Profiling is done for the purpose of understanding the computational load (GFLOPs) and time taken per layer within + the YOLOv5 model. + - If the `thop` library is not available, FLOPs computation will not be performed. + """ c = m == self.model[-1] # is final layer, copy input as inplace fix o = thop.profile(m, inputs=(x.copy() if c else x,), verbose=False)[0] / 1e9 * 2 if thop else 0 # FLOPs t = time_sync() @@ -185,7 +367,25 @@ class BaseModel(nn.Module): LOGGER.info(f"{sum(dt):10.2f} {'-':>10s} {'-':>10s} Total") def fuse(self): - """Fuses Conv2d() and BatchNorm2d() layers in the model to improve inference speed.""" + """ + Fuses Conv2d and BatchNorm2d layers in the model to optimize inference speed. + + This method modifies the model in place by merging Conv2d and BatchNorm2d layers into single Conv2d + layers where applicable. This can significantly improve inference speed and reduce memory usage. + + Returns: + None + + Example: + ```python + model = BaseModel() + model.fuse() + ``` + + Note: + After fusing layers, the forward method of fused layers is updated to `forward_fuse`, optimizing + the execution path. + """ LOGGER.info("Fusing layers... ") for m in self.model.modules(): if isinstance(m, (Conv, DWConv)) and hasattr(m, "bn"): @@ -196,12 +396,44 @@ class BaseModel(nn.Module): return self def info(self, verbose=False, img_size=640): - """Prints model information given verbosity and image size, e.g., `info(verbose=True, img_size=640)`.""" + """ + Display model summary, including layer details and computational complexity for a specified image size. + + Args: + verbose (bool): If True, prints a detailed summary including information about each layer. Defaults to False. + img_size (int | tuple[int]): Size of the input image as an integer (for square images) or tuple (H, W). + Defaults to 640. + + Returns: + (None): This function does not return any value. It directly prints the model summary to the console. + + Example: + ```python + model = BaseModel() + model.info(verbose=True, img_size=640) + ``` + + Note: + Ensure that the `verbose` parameter is set to True for a comprehensive layer-by-layer summary. The image size should + be supplied based on the expected input size for the model. + """ model_info(self, verbose, img_size) def _apply(self, fn): - """Applies transformations like to(), cpu(), cuda(), half() to model tensors excluding parameters or registered - buffers. + """ + Apply a function to the model and its layer parameters, including specific modifications for Detect and Segment layers. 
+ + Args: + fn (function): A function to apply to the model's tensors. + + Returns: + (torch.nn.Module): The module with applied transformations. + + Note: + The function is particularly useful for operations like converting tensors to a target device + (e.g., CUDA, CPU) or altering their precision (e.g., float16). The Detect layer's stride and grid + parameters, as well as the Segment layer's anchor grids, are specifically modified to ensure consistency + after such transformations. """ self = super()._apply(fn) m = self.model[-1] # Detect() @@ -216,7 +448,36 @@ class BaseModel(nn.Module): class DetectionModel(BaseModel): # YOLOv5 detection model def __init__(self, cfg="yolov5s.yaml", ch=3, nc=None, anchors=None): - """Initializes YOLOv5 model with configuration file, input channels, number of classes, and custom anchors.""" + """ + Initializes YOLOv5 model using the specified config, input channels, class count, and custom anchors. + + Args: + cfg (str | dict): Model configuration, either a path to a YAML config file or a configuration dictionary. + ch (int): Number of input channels. Defaults to 3. + nc (int | None): Number of classes. If provided, overrides the value in the YAML file/config dictionary. Defaults to None. + anchors (list[int] | None): Custom anchors. If provided, overrides the anchors defined in the YAML file/config + dictionary. Defaults to None. + + Returns: + None + + Example: + ```python + from ultralytics.models.yolo import DetectionModel + + # Initialize model with path to YAML config + model1 = DetectionModel(cfg="yolov5s.yaml") + + # Initialize model with configuration dictionary + cfg_dict = {"nc": 80, "depth_multiple": 0.33, "width_multiple": 0.50} + model2 = DetectionModel(cfg=cfg_dict, ch=3, nc=80) + ``` + + Note: + If `cfg` is a dictionary, it should include the necessary parameters such as `nc`, `depth_multiple`, and `width_multiple`. + During initialization, the model configuration from the YAML file or dictionary is parsed, and the internal model + structure is built accordingly. This includes defining the detection layers and adjusting anchors and strides. + """ super().__init__() if isinstance(cfg, dict): self.yaml = cfg # model dict @@ -261,13 +522,64 @@ class DetectionModel(BaseModel): LOGGER.info("") def forward(self, x, augment=False, profile=False, visualize=False): - """Performs single-scale or augmented inference and may include profiling or visualization.""" + """ + Perform forward pass through the YOLOv5 detection model for training or inference, with options for augmentation, + profiling, and visualization. + + Args: + x (torch.Tensor): Input tensor with a shape of (N, C, H, W), where N is the batch size, C is the number of channels, + H is the height, and W is the width. + augment (bool): If True, performs augmented inference. Defaults to False. + profile (bool): If True, profiles the execution time of each layer. Defaults to False. + visualize (bool): If True, stores and visualizes feature maps. Defaults to False. + + Returns: + (torch.Tensor | tuple): Depending on the mode, returns either: + - In training mode: tuple containing predictions for each scale with shapes (N, 3, H, W, no). + - In inference mode: tensor with shape (N, M, no), where M is the number of predicted objects after + non-maximum suppression. + - When exporting: tuple containing concatenated inference output tensor and intermediate feature maps. 
+ + Example: + ```python + model = DetectionModel(cfg="yolov5s.yaml", ch=3, nc=80) + input_tensor = torch.randn(1, 3, 640, 640) + output = model.forward(input_tensor, augment=False, profile=True, visualize=False) + ``` + + Note: + This method adapts to training and inference modes, with different return types based on the operational mode. + During training mode, it returns raw predictions across various scales for loss calculation, whereas in inference + mode, non-maximum suppression (NMS) is applied to refine predictions. + """ if augment: return self._forward_augment(x) # augmented inference, None return self._forward_once(x, profile, visualize) # single-scale inference, train def _forward_augment(self, x): - """Performs augmented inference across different scales and flips, returning combined detections.""" + """ + Performs augmented inference by processing input across different scales and flips, merging the outputs. + + Args: + x (torch.Tensor): Input tensor with shape (N, C, H, W), where N is batch size, C is number of channels, + H and W are height and width. + + Returns: + (torch.Tensor): Merged output tensor after multi-scale and flip augmentations, with shape (N, M, no), + where N is batch size, M is the number of predictions, and no is the number of output features. + + Example: + ```python + model = DetectionModel(cfg='yolov5s.yaml') + input_tensor = torch.randn(1, 3, 640, 640) + output = model._forward_augment(input_tensor) + ``` + + Note: + The function processes the input using different scales (1, 0.83, 0.67) and flips (None, horizontal), + descaling predictions before merging. This helps to improve model robustness and accuracy + during inference. + """ img_size = x.shape[-2:] # height, width s = [1, 0.83, 0.67] # scales f = [None, 3, None] # flips (2-ud, 3-lr) @@ -282,7 +594,23 @@ class DetectionModel(BaseModel): return torch.cat(y, 1), None # augmented inference, train def _descale_pred(self, p, flips, scale, img_size): - """De-scales predictions from augmented inference, adjusting for flips and image size.""" + """ + Adjusts predictions for augmented inference by de-scaling and correcting for flips or image size changes. + + Args: + p (torch.Tensor): Predictions tensor with shape (..., N) where N indicates prediction attributes like + bounding box coordinates, confidence score, etc. + flips (int | None): Specifies flip mode. `2` for vertical flip, `3` for horizontal flip, and `None` for no flip. + scale (float): Scale factor used during augmentation. + img_size (tuple[int, int]): Original image dimensions as (height, width). + + Returns: + (torch.Tensor): Adjusted predictions tensor with the same shape as input, de-scaled and de-flipped appropriately. + + Note: + If inplace operations are enabled, the adjustments are applied directly on the tensor. Otherwise, new tensors are + created for the adjusted values to avoid modifying the original input. + """ if self.inplace: p[..., :4] /= scale # de-scale if flips == 2: @@ -299,8 +627,18 @@ class DetectionModel(BaseModel): return p def _clip_augmented(self, y): - """Clips augmented inference tails for YOLOv5 models, affecting first and last tensors based on grid points and - layer counts. + """ + Clip augmented inference tails for YOLOv5 models, adjusting predictions from the first and last layers. + + Args: + y (list[torch.Tensor]): List of tensors, where each tensor represents detections from augmented inference across different layers. 
+ + Returns: + (list[torch.Tensor]): Modified list of tensors with clipped augmented inference tails. + + Notes: + This function helps to discard the augmented tails by adjusting predictions from the first and last layers, + which might otherwise introduce artifacts due to the augmentation process. """ nl = self.model[-1].nl # number of detection layers (P3-P5) g = sum(4**x for x in range(nl)) # grid points @@ -313,9 +651,36 @@ class DetectionModel(BaseModel): def _initialize_biases(self, cf=None): """ - Initializes biases for YOLOv5's Detect() module, optionally using class frequencies (cf). - - For details see https://arxiv.org/abs/1708.02002 section 3.3. + Initialize biases for the YOLOv5 Detect module using specified or default bias adjustments. + + Args: + cf (torch.Tensor | None): Optional tensor representing class frequencies for bias initialization. The shape should be + (N,), where N is the number of classes. If not provided, default adjustments are applied based on the number of + classes and image dimensions. + + Returns: + (torch.Tensor): Updated biases for the model with shape (N, M), where N is the number of anchors and M is the number of + outputs per anchor. + + Note: + The function calculates the biases based on principles from https://arxiv.org/abs/1708.02002, section 3.3. If class + frequencies (`cf`) are not provided, default bias adjustments are made. Adjustments primarily ensure that objectness and + class biases are reasonably initialized for effective training. + + Example: + ```python + from ultralytics.yolov5 import DetectionModel + import torch + + # Initialize model + model = DetectionModel(cfg="yolov5s.yaml") + + # Optional class frequencies tensor + class_frequencies = torch.tensor([100, 150, 200]) + + # Initialize biases + model._initialize_biases(cf=class_frequencies) + ``` """ # cf = torch.bincount(torch.tensor(np.concatenate(dataset.labels, 0)[:, 0]).long(), minlength=nc) + 1. m = self.model[-1] # Detect() module @@ -334,22 +699,91 @@ Model = DetectionModel # retain YOLOv5 'Model' class for backwards compatibilit class SegmentationModel(DetectionModel): # YOLOv5 segmentation model def __init__(self, cfg="yolov5s-seg.yaml", ch=3, nc=None, anchors=None): - """Initializes a YOLOv5 segmentation model with configurable params: cfg (str) for configuration, ch (int) for channels, nc (int) for num classes, anchors (list).""" + """ + Initializes a YOLOv5 segmentation model with configurable parameters. + + Args: + cfg (str): Path to the configuration file containing model architecture and parameters. Defaults to "yolov5s-seg.yaml". + ch (int): Number of input channels. Defaults to 3. + nc (int | None): Number of classes. If provided, overrides the number of classes specified in the cfg file. + anchors (list | None): List of anchor points. If provided, overrides the anchor configuration in the cfg file. + + Returns: + (None): Initializes various components of the SegmentationModel class. + + Example: + ```python + from ultralytics import SegmentationModel + model = SegmentationModel() + ``` + + Note: + The initialization includes setting up model layers, anchors, and other configurations based on the provided + or default configuration file. + """ super().__init__(cfg, ch, nc, anchors) class ClassificationModel(BaseModel): # YOLOv5 classification model def __init__(self, cfg=None, model=None, nc=1000, cutoff=10): - """Initializes YOLOv5 model with config file `cfg`, input channels `ch`, number of classes `nc`, and `cuttoff` - index. 
+ """ + Initializes a YOLOv5 classification model with either a configuration file or a pre-built model, specifying + the number of classes and a cutoff layer index. + + Args: + cfg (str | None): Path to the model configuration file, or None if using `model`. + model (torch.nn.Module | None): Pre-built torch model, or None if using `cfg`. + nc (int): Number of output classes, default is 1000. + cutoff (int): Index of the cutoff layer, default is 10. + + Returns: + None + + Example: + ```python + # Initializing from a configuration file + model = ClassificationModel(cfg='yolov5-class-config.yaml', nc=1000, cutoff=10) + + # Initializing from an existing model + model = ClassificationModel(model=prebuilt_model, nc=1000, cutoff=10) + ``` + + Note: + This model can be extended or customized by modifying the configuration file or the pre-built model. """ super().__init__() self._from_detection_model(model, nc, cutoff) if model is not None else self._from_yaml(cfg) def _from_detection_model(self, model, nc=1000, cutoff=10): - """Creates a classification model from a YOLOv5 detection model, slicing at `cutoff` and adding a classification - layer. + """ + Perform a transformation from a YOLOv5 detection model to a classification model. + + Args: + model (DetectionModel): A pre-trained YOLOv5 detection model. + nc (int): Number of classes for the classification model. Default is 1000. + cutoff (int): Index to slice the model's layers up to the classification layer. Default is 10. + + Returns: + None. The function modifies the model in place. + + Notes: + This function takes a detection model and transforms it into a classification model by slicing the model layers + at the specified cutoff point and adding a classification layer with the specified number of classes. + - If the input model is wrapped by `DetectMultiBackend`, it unwraps the model to get the underlying YOLOv5 model. + - Constructs a `Classify` layer, replacing the final detection layer with this new classification layer. + + Example: + ```python + from ultralytics import YOLOv5 + + # Load a pre-trained detection model + detection_model = YOLOv5.load('yolov5s.pt') + + # Create a classification model from detection model + classification_model = YOLOv5.ClassificationModel() + classification_model._from_detection_model(detection_model, nc=1000, cutoff=10) + ``` """ if isinstance(model, DetectMultiBackend): model = model.model # unwrap DetectMultiBackend @@ -365,12 +799,49 @@ class ClassificationModel(BaseModel): self.nc = nc def _from_yaml(self, cfg): - """Creates a YOLOv5 classification model from a specified *.yaml configuration file.""" + """ + Perform initialization and parsing from a YOLOv5 configuration file. + + Args: + cfg (str): Path to the YOLOv5 YAML configuration file. + + Returns: + None. The function modifies the model in place utilizing the defined configuration parameters. + + Notes: + This function reads a YOLOv5 YAML configuration file and constructs the classification model accordingly. It sets the + appropriate channels, layers, and output classes based on the parsed configuration data. + """ self.model = None def parse_model(d, ch): - """Parses a YOLOv5 model from a dict `d`, configuring layers based on input channels `ch` and model architecture.""" + """ + Parses YOLOv5 model architecture from a configuration dictionary and initializes its layers. + + Args: + d (dict): Dictionary containing model configuration. 
Must include keys: "anchors", "nc", "depth_multiple", + "width_multiple", and optionally "activation" and "channel_multiple". + ch (list[int]): List of input channels for each layer. + + Returns: + (tuple[nn.Sequential, list[int]]): A tuple containing: + - `model` (nn.Sequential): The constructed YOLOv5 model based on the configuration. + - `save` (list[int]): List of layers whose outputs should be preserved during the forward pass. + + Example: + ```python + from pathlib import Path + import yaml + + # Load model configuration YAML + with open(Path('yolov5s.yaml'), 'r') as file: + model_config = yaml.safe_load(file) + + # Parse model and initialize + model, save = parse_model(model_config, ch=[3]) + ``` + """ LOGGER.info(f"\n{'':>3}{'from':>18}{'n':>3}{'params':>10} {'module':<40}{'arguments':<30}") anchors, nc, gd, gw, act, ch_mul = ( d["anchors"],