Refactor code for speed and clarity

pull/13198/head
Glenn Jocher 2024-07-17 22:26:29 +02:00
parent 8003649c79
commit 90cd326b3f
4 changed files with 3260 additions and 248 deletions

models/experimental.py

@@ -14,8 +14,19 @@ class Sum(nn.Module):
"""Weighted sum of 2 or more layers https://arxiv.org/abs/1911.09070."""
def __init__(self, n, weight=False):
"""Initializes a module to sum outputs of layers with number of inputs `n` and optional weighting, supporting 2+
inputs.
"""
Initialize the Sum module to aggregate outputs from multiple layers, optionally with weights.
Args:
n (int): Number of layers to sum. Must be 2 or more.
weight (bool): If True, applies weights to the inputs before summing.
Returns:
None
Notes:
Refer to "Weighted sum of 2 or more layers" at https://arxiv.org/abs/1911.09070 for detailed insights
and usage scenarios.
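Example:
A minimal usage sketch (shapes are illustrative; assumes `torch` is imported):
```python
sum_layer = Sum(n=3, weight=True)
inputs = [torch.rand(1, 64), torch.rand(1, 64), torch.rand(1, 64)]
combined = sum_layer(inputs)  # same shape as each input: (1, 64)
```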
"""
super().__init__()
self.weight = weight # apply weights boolean
@@ -24,7 +35,26 @@ class Sum(nn.Module):
self.w = nn.Parameter(-torch.arange(1.0, n) / 2, requires_grad=True) # layer weights
def forward(self, x):
"""Processes input through a customizable weighted sum of `n` inputs, optionally applying learned weights."""
"""
Compute a weighted or unweighted sum of input tensors.
Args:
x (list[torch.Tensor]): List of input tensors to be summed, with each tensor having the same shape (N, D).
Returns:
(torch.Tensor): The resulting tensor after summing the input tensors, maintaining the same shape (N, D).
Example:
```python
sum_layer = Sum(n=3, weight=False)
inputs = [torch.rand(1, 10), torch.rand(1, 10), torch.rand(1, 10)]
result = sum_layer.forward(inputs)
```
Note:
If `weight` is set to True when initializing the class, weights will be applied to the inputs before summing.
For more information, refer to "Weighted sum of 2 or more layers" at https://arxiv.org/abs/1911.09070.
"""
y = x[0] # no weight
if self.weight:
w = torch.sigmoid(self.w) * 2
@@ -40,8 +70,29 @@ class MixConv2d(nn.Module):
"""Mixed Depth-wise Conv https://arxiv.org/abs/1907.09595."""
def __init__(self, c1, c2, k=(1, 3), s=1, equal_ch=True):
"""Initializes MixConv2d with mixed depth-wise convolutional layers, taking input and output channels (c1, c2),
kernel sizes (k), stride (s), and channel distribution strategy (equal_ch).
"""
Initialize the MixConv2d module, handling mixed depth-wise convolutional operations.
Args:
c1 (int): Number of input channels (C1).
c2 (int): Number of output channels (C2).
k (tuple[int]): Kernel sizes for the convolutional layers.
s (int): Stride value for the convolutional layers.
equal_ch (bool): Flag to determine if channels are distributed equally. True for equal channels per group, False
for equal weight.numel() per group.
Example:
```python
mixconv = MixConv2d(c1=32, c2=64, k=(1, 3, 5), s=1, equal_ch=True)
output = mixconv(input_tensor)
```
Note:
The `MixConv2d` layer applies multiple depth-wise convolutions with different kernel sizes in parallel, which
can capture multi-scale features within a single layer. This technique is particularly useful for improving
spatial feature extraction and reducing model complexity.
Further reading: https://arxiv.org/abs/1907.09595
"""
super().__init__()
n = len(k) # number of convolutions
@@ -63,8 +114,24 @@ class MixConv2d(nn.Module):
self.act = nn.SiLU()
def forward(self, x):
"""Performs forward pass by applying SiLU activation on batch-normalized concatenated convolutional layer
outputs.
"""
Perform forward pass by applying mixed depth-wise convolutions followed by batch normalization and SiLU activation.
Args:
x (torch.Tensor): Input tensor with shape (N, C, H, W) where N is the batch size, C is the number of channels,
H is the height, and W is the width.
Returns:
(torch.Tensor): Output tensor after applying mixed convolutions, batch normalization, and SiLU activation,
maintaining the shape (N, C', H', W') where C' is the output channels based on the convolutional layer
configuration.
Example:
```python
mixconv = MixConv2d(c1=32, c2=64, k=(1, 3), s=1)
x = torch.randn(16, 32, 128, 128)
output = mixconv(x)
```
"""
return self.act(self.bn(torch.cat([m(x) for m in self.m], 1)))
@@ -73,11 +140,51 @@ class Ensemble(nn.ModuleList):
"""Ensemble of models."""
def __init__(self):
"""Initializes an ensemble of models to be used for aggregated predictions."""
"""
Initializes an ensemble of models for combined inference and aggregated predictions.
Example:
```python
ensemble = Ensemble()
model1 = MyModel1()
model2 = MyModel2()
ensemble.append(model1)
ensemble.append(model2)
```
"""
super().__init__()
def forward(self, x, augment=False, profile=False, visualize=False):
"""Performs forward pass aggregating outputs from an ensemble of models.."""
"""
Aggregates outputs from multiple models in the ensemble by concatenating them during the forward pass.
Args:
x (torch.Tensor): Input tensor with shape (N, C, H, W) where N is the batch size, C is the number of channels,
H is the height, and W is the width.
augment (bool): Flag to apply test-time augmentation (TTA) during inference. Default is False.
profile (bool): If True, enables profiling of the forward pass. Default is False.
visualize (bool): If True, enables visualization of model predictions. Default is False.
Returns:
(torch.Tensor): Aggregated output tensor from the ensemble models, with shape dependent on the number of models
and their architectures.
Example:
```python
from models.experimental import Ensemble
import torch
# Initialize the ensemble
ensemble = Ensemble()
# Assume models are already added to the ensemble
# Create a dummy input tensor
x = torch.randn(8, 3, 640, 640) # Example input for 8 images of 3 channels and 640x640 resolution
# Perform forward pass
output = ensemble.forward(x, augment=False, profile=False, visualize=False)
```
"""
y = [module(x, augment, profile, visualize)[0] for module in self]
# y = torch.stack(y).max(0)[0] # max ensemble
# y = torch.stack(y).mean(0) # mean ensemble
@@ -87,9 +194,32 @@ class Ensemble(nn.ModuleList):
def attempt_load(weights, device=None, inplace=True, fuse=True):
"""
Loads and fuses an ensemble or single YOLOv5 model from weights, handling device placement and model adjustments.
Example inputs: weights=[a,b,c] or a single model weights=[a] or weights=a.
Loads and fuses a YOLOv5 model or an ensemble of models from provided weights, adjusting device placement and model
attributes for optimal performance.
Args:
weights (str | list[str]): Path(s) to model weight file(s). It can be a single path or a list of paths.
device (torch.device | None, optional): Device to load the model on. If None, loads on CPU by default.
inplace (bool, optional): If True, enables inplace operations in certain layers like activation layers.
Defaults to True.
fuse (bool, optional): Whether to fuse Conv2d + BatchNorm2d layers for speedup during inference. Defaults to True.
Returns:
(torch.nn.Module): Loaded YOLOv5 model or an ensemble of models loaded onto the specified device.
Example:
```python
# Load a single model weight
model = attempt_load('yolov5s.pt')
# Load an ensemble of models
model = attempt_load(['yolov5s.pt', 'yolov5m.pt', 'yolov5l.pt'])
```
Note:
- This function ensures compatibility and performance optimization by adjusting attributes and configurations of the
loaded model(s).
- If `fuse` is set to True, it will fuse Conv2d and BatchNorm2d layers within the model(s) to speed up inference.
"""
from models.yolo import Detect, Model

models/yolo.py

@@ -76,7 +76,27 @@ class Detect(nn.Module):
export = False # export mode
def __init__(self, nc=80, anchors=(), ch=(), inplace=True):
"""Initializes YOLOv5 detection layer with specified classes, anchors, channels, and inplace operations."""
"""
Initializes the YOLOv5 Detect layer with class count, anchors, channels, and inplace operations.
Args:
nc (int, optional): Number of classes. Default is 80.
anchors (tuple, optional): Anchor box dimensions, typically specified for each detection layer. Default is ().
ch (tuple, optional): Number of input channels for each detection layer. Default is ().
inplace (bool, optional): If True, operations are done inplace. Default is True.
Returns:
None
Example:
```python
detect_layer = Detect(nc=80, anchors=(), ch=(256, 512, 1024), inplace=True)
```
Note:
This function initializes detection heads in the YOLOv5 model, setting up convolution layers, grids, and
anchor grids required for object detection inference.
"""
super().__init__()
self.nc = nc # number of classes
self.no = nc + 5 # number of outputs per anchor
@@ -89,7 +109,23 @@ class Detect(nn.Module):
self.inplace = inplace # use inplace ops (e.g. slice assignment)
def forward(self, x):
"""Processes input through YOLOv5 layers, altering shape for detection: `x(bs, 3, ny, nx, 85)`."""
"""
Processes input through detection layers, reshaping and applying convolution for YOLOv5 inference.
Args:
x (list[torch.Tensor]): List of feature maps from backbone with shape (B, C, H, W) where B is the batch
size, C is the number of channels, and H and W are height and width.
Returns:
(list[torch.Tensor]): List of processed detections, each a torch Tensor with shape (B, N, D) where B
is the batch size, N is the number of detections, and D is the dimensions of each detection
(e.g., bounding box coordinates, objectness score, class probabilities).
Note:
This method applies a series of convolutions to transform the input feature maps into detection
outputs. It also handles reshaping and permutation to align with YOLOv5's output format. During
inference, additional steps are performed to compute final object locations and dimensions.
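Example:
A standalone sketch in training mode (anchor and channel values are illustrative; assumes `torch` is imported):
```python
anchors = ([10, 13, 16, 30, 33, 23], [30, 61, 62, 45, 59, 119], [116, 90, 156, 198, 373, 326])
detect = Detect(nc=80, anchors=anchors, ch=(128, 256, 512))
detect.train()  # training mode returns raw per-layer tensors
feats = [torch.randn(1, c, s, s) for c, s in zip((128, 256, 512), (80, 40, 20))]
outputs = detect(feats)  # list of 3 tensors, each shaped (1, 3, ny, nx, 85)
```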
"""
z = [] # inference output
for i in range(self.nl):
x[i] = self.m[i](x[i]) # conv
@@ -115,7 +151,29 @@ class Detect(nn.Module):
return x if self.training else (torch.cat(z, 1),) if self.export else (torch.cat(z, 1), x)
def _make_grid(self, nx=20, ny=20, i=0, torch_1_10=check_version(torch.__version__, "1.10.0")):
"""Generates a mesh grid for anchor boxes with optional compatibility for torch versions < 1.10."""
"""
Generate a mesh grid for anchor boxes with torch version compatibility for detection models.
Args:
nx (int): Number of grid cells along the x-axis.
ny (int): Number of grid cells along the y-axis.
i (int): Index of the detection layer for which the grid is being generated.
torch_1_10 (bool): Indicator whether the torch version is at least 1.10.0 for meshgrid compatibility.
Returns:
(tuple[torch.Tensor, torch.Tensor]): A tuple containing two tensors:
- grid (torch.Tensor): The generated grid with shape (1, num_anchors, ny, nx, 2), containing xy coordinates.
- anchor_grid (torch.Tensor): The anchor grid scaled by the stride, with shape (1, num_anchors, ny, nx, 2).
Example:
```python
detector = Detect()
grid, anchor_grid = detector._make_grid(20, 20, 0)
```
Note:
The function ensures compatibility with different torch versions by using appropriate meshgrid indexing options.
"""
d = self.anchors[i].device
t = self.anchors[i].dtype
shape = 1, self.na, ny, nx, 2 # grid shape
@@ -129,7 +187,25 @@ class Detect(nn.Module):
class Segment(Detect):
# YOLOv5 Segment head for segmentation models
def __init__(self, nc=80, anchors=(), nm=32, npr=256, ch=(), inplace=True):
"""Initializes YOLOv5 Segment head with options for mask count, protos, and channel adjustments."""
"""
Initializes YOLOv5 Segment head with parameters for masks, prototypes, class count, anchors, and channels.
Args:
nc (int): Number of classes for the segmentation model (default is 80).
anchors (tuple): Tuple of anchor box dimensions for the segmentation model.
nm (int): Number of masks for the segmentation (default is 32).
npr (int): Number of prototypes for the masks (default is 256).
ch (tuple): Tuple of input channels for each detection layer.
inplace (bool): If True, use in-place operations for layer computations (default is True).
Returns:
None
Example:
```python
segment_head = Segment(nc=80, anchors=anchors, nm=32, npr=256, ch=[512, 256, 128], inplace=True)
```
"""
super().__init__(nc, anchors, ch, inplace)
self.nm = nm # number of masks
self.npr = npr # number of protos
@@ -139,8 +215,38 @@ class Segment(Detect):
self.detect = Detect.forward
def forward(self, x):
"""Processes input through the network, returning detections and prototypes; adjusts output based on
training/export mode.
"""
Processes input through the network, returning detections and prototypes.
Args:
x (list[torch.Tensor]): List of input tensors corresponding to different detection layers, each with shape
(B, C, H, W), where B is batch size, C is number of channels, H and W are height and width.
Returns:
(tuple[torch.Tensor, torch.Tensor]): A tuple containing:
- `detection` (torch.Tensor): The detection output tensor with shape (B, N, 85), where B is batch size, N is
the number of detections.
- `prototypes` (torch.Tensor): The prototype masks tensor produced by the network with shape (B, P, H', W'),
where B is batch size, P is the number of prototypes, and H' and W' correspond to height and width.
Example:
```python
import torch
from models.yolo import Segment
# Initialize the Segment head (anchor and channel values are illustrative)
anchors = ([10, 13, 16, 30, 33, 23], [30, 61, 62, 45, 59, 119], [116, 90, 156, 198, 373, 326])
model = Segment(nc=80, anchors=anchors, ch=(128, 256, 512))
# Generate dummy multi-scale feature maps
x = [torch.randn(1, c, s, s) for c, s in zip((128, 256, 512), (80, 40, 20))]
# Forward pass (training mode returns detections and prototypes)
detection, prototypes = model(x)
```
Note:
During inference (evaluation mode), detection outputs are post-processed to generate final bounding boxes and classes.
In training mode, the outputs are not processed.
"""
p = self.proto(x[0])
x = self.detect(self, x)
@@ -151,13 +257,64 @@ class BaseModel(nn.Module):
"""YOLOv5 base model."""
def forward(self, x, profile=False, visualize=False):
"""Executes a single-scale inference or training pass on the YOLOv5 base model, with options for profiling and
visualization.
"""
Perform a forward pass through the YOLOv5 model, optionally profiling and visualizing features.
Args:
x (torch.Tensor): Input data tensor with shape (N, C, H, W).
profile (bool): Whether to profile execution time of each layer. Defaults to False.
visualize (bool): Whether to store and visualize feature maps. Defaults to False.
Returns:
(torch.Tensor | list[torch.Tensor]): In training mode, returns a list of prediction tensors, each with shape
(N, 3, H, W, no). In inference mode, returns a tensor of candidate detections with shape (N, M, no); non-maximum
suppression (NMS) is applied downstream.
Example:
```python
model = BaseModel()
input_tensor = torch.randn(1, 3, 640, 640)
output = model.forward(input_tensor, profile=True, visualize=True)
```
Note:
- In training mode, the method returns unprocessed predictions for each scale, suitable for loss calculation.
- In inference mode, non-maximum suppression is applied to refine predictions.
"""
return self._forward_once(x, profile, visualize) # single-scale inference, train
def _forward_once(self, x, profile=False, visualize=False):
"""Performs a forward pass on the YOLOv5 model, enabling profiling and feature visualization options."""
"""
Execute a forward pass through the YOLOv5 model layers with optional profiling and visualization.
Args:
x (torch.Tensor): Input tensor with shape (N, C, H, W), where N is the batch size, C is the number
of channels, and H and W are the height and width of the input image, respectively.
profile (bool): If True, profiles the execution time for each layer. Defaults to False.
visualize (bool): If True, stores and visualizes feature maps. Defaults to False.
Returns:
(torch.Tensor): Model output tensor with shape depending on whether the model is in training or
inference mode.
- In training mode: Returns a list of tensors for each detection layer, each tensor has shape
(N, 3, H, W, no), where `no` is the number of outputs per anchor.
- In inference mode: If not exporting, returns a tuple with a single tensor of shape (N, M, no),
where M is the number of predicted objects.
- If exporting: Returns a tensor of shape (N, M, no).
Example:
```python
model = BaseModel()
input_tensor = torch.randn(1, 3, 640, 640) # Generate a random input tensor
output = model._forward_once(input_tensor, profile=True, visualize=True)
```
Note:
This method conducts a single-scale inference or training pass through the model. Depending on the mode
(training or inference), the method behaves differently. In training mode, it returns unprocessed
predictions for each detection layer. In inference mode, non-maximum suppression (NMS) is typically
applied after this method to refine predictions.
"""
y, dt = [], [] # outputs
for m in self.model:
if m.f != -1: # if not from previous layer
@@ -171,7 +328,32 @@ class BaseModel(nn.Module):
return x
def _profile_one_layer(self, m, x, dt):
"""Profiles a single layer's performance by computing GFLOPs, execution time, and parameters."""
"""
Profiles a single model layer's GFLOPs, parameters, and execution time within the YOLOv5 model.
Args:
m (nn.Module): The model layer to be profiled.
x (torch.Tensor): Input tensor passed to the model layer, with shape (N, C, H, W).
dt (list[float]): List to record execution times of the profiled layer.
Returns:
None: The function updates the `dt` list with the execution time of the layer in milliseconds.
Example:
```python
model = BaseModel()
layer = nn.Conv2d(3, 16, 3, 1) # Example layer
input_tensor = torch.randn(1, 3, 640, 640) # Example input
execution_times = []
model._profile_one_layer(layer, input_tensor, execution_times)
```
Note:
- Profiling is done for the purpose of understanding the computational load (GFLOPs) and time taken per layer within
the YOLOv5 model.
- If the `thop` library is not available, FLOPs computation will not be performed.
"""
c = m == self.model[-1] # is final layer, copy input as inplace fix
o = thop.profile(m, inputs=(x.copy() if c else x,), verbose=False)[0] / 1e9 * 2 if thop else 0 # FLOPs
t = time_sync()
@@ -185,7 +367,25 @@ class BaseModel(nn.Module):
LOGGER.info(f"{sum(dt):10.2f} {'-':>10s} {'-':>10s} Total")
def fuse(self):
"""Fuses Conv2d() and BatchNorm2d() layers in the model to improve inference speed."""
"""
Fuses Conv2d and BatchNorm2d layers in the model to optimize inference speed.
This method modifies the model in place by merging Conv2d and BatchNorm2d layers into single Conv2d
layers where applicable. This can significantly improve inference speed and reduce memory usage.
Returns:
None
Example:
```python
model = BaseModel()
model.fuse()
```
Note:
After fusing layers, the forward method of fused layers is updated to `forward_fuse`, optimizing
the execution path.
"""
LOGGER.info("Fusing layers... ")
for m in self.model.modules():
if isinstance(m, (Conv, DWConv)) and hasattr(m, "bn"):
@@ -196,12 +396,44 @@ class BaseModel(nn.Module):
return self
def info(self, verbose=False, img_size=640):
"""Prints model information given verbosity and image size, e.g., `info(verbose=True, img_size=640)`."""
"""
Display model summary, including layer details and computational complexity for a specified image size.
Args:
verbose (bool): If True, prints a detailed summary including information about each layer. Defaults to False.
img_size (int | tuple[int]): Size of the input image as an integer (for square images) or tuple (H, W).
Defaults to 640.
Returns:
(None): This function does not return any value. It directly prints the model summary to the console.
Example:
```python
model = BaseModel()
model.info(verbose=True, img_size=640)
```
Note:
Ensure that the `verbose` parameter is set to True for a comprehensive layer-by-layer summary. The image size should
be supplied based on the expected input size for the model.
"""
model_info(self, verbose, img_size)
def _apply(self, fn):
"""Applies transformations like to(), cpu(), cuda(), half() to model tensors excluding parameters or registered
buffers.
"""
Apply a function to the model and its layer parameters, including specific modifications for Detect and Segment layers.
Args:
fn (function): A function to apply to the model's tensors.
Returns:
(torch.nn.Module): The module with applied transformations.
Note:
The function is particularly useful for operations like converting tensors to a target device
(e.g., CUDA, CPU) or altering their precision (e.g., float16). The Detect layer's stride and grid
parameters, as well as the Segment layer's anchor grids, are specifically modified to ensure consistency
after such transformations.
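Example:
A minimal sketch; `_apply` is not called directly but runs under methods such as `.to()`, `.cuda()`, or `.half()`:
```python
model = DetectionModel(cfg="yolov5s.yaml")
model = model.half()  # _apply casts parameters plus Detect strides, grids, and anchor grids
```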
"""
self = super()._apply(fn)
m = self.model[-1] # Detect()
@@ -216,7 +448,36 @@ class BaseModel(nn.Module):
class DetectionModel(BaseModel):
# YOLOv5 detection model
def __init__(self, cfg="yolov5s.yaml", ch=3, nc=None, anchors=None):
"""Initializes YOLOv5 model with configuration file, input channels, number of classes, and custom anchors."""
"""
Initializes YOLOv5 model using the specified config, input channels, class count, and custom anchors.
Args:
cfg (str | dict): Model configuration, either a path to a YAML config file or a configuration dictionary.
ch (int): Number of input channels. Defaults to 3.
nc (int | None): Number of classes. If provided, overrides the value in the YAML file/config dictionary. Defaults to None.
anchors (list[int] | None): Custom anchors. If provided, overrides the anchors defined in the YAML file/config
dictionary. Defaults to None.
Returns:
None
Example:
```python
from models.yolo import DetectionModel
# Initialize model with path to YAML config
model1 = DetectionModel(cfg="yolov5s.yaml")
# Initialize model with configuration dictionary
cfg_dict = {"nc": 80, "depth_multiple": 0.33, "width_multiple": 0.50}
model2 = DetectionModel(cfg=cfg_dict, ch=3, nc=80)
```
Note:
If `cfg` is a dictionary, it should include the necessary parameters such as `nc`, `depth_multiple`, and `width_multiple`.
During initialization, the model configuration from the YAML file or dictionary is parsed, and the internal model
structure is built accordingly. This includes defining the detection layers and adjusting anchors and strides.
"""
super().__init__()
if isinstance(cfg, dict):
self.yaml = cfg # model dict
@@ -261,13 +522,64 @@ class DetectionModel(BaseModel):
LOGGER.info("")
def forward(self, x, augment=False, profile=False, visualize=False):
"""Performs single-scale or augmented inference and may include profiling or visualization."""
"""
Perform forward pass through the YOLOv5 detection model for training or inference, with options for augmentation,
profiling, and visualization.
Args:
x (torch.Tensor): Input tensor with a shape of (N, C, H, W), where N is the batch size, C is the number of channels,
H is the height, and W is the width.
augment (bool): If True, performs augmented inference. Defaults to False.
profile (bool): If True, profiles the execution time of each layer. Defaults to False.
visualize (bool): If True, stores and visualizes feature maps. Defaults to False.
Returns:
(torch.Tensor | tuple): Depending on the mode, returns either:
- In training mode: tuple containing predictions for each scale with shapes (N, 3, H, W, no).
- In inference mode: tensor with shape (N, M, no), where M is the number of candidate detections before
non-maximum suppression.
- When exporting: tuple containing concatenated inference output tensor and intermediate feature maps.
Example:
```python
model = DetectionModel(cfg="yolov5s.yaml", ch=3, nc=80)
input_tensor = torch.randn(1, 3, 640, 640)
output = model.forward(input_tensor, augment=False, profile=True, visualize=False)
```
Note:
This method adapts to training and inference modes, with different return types based on the operational mode.
During training mode, it returns raw predictions across various scales for loss calculation, whereas in inference
mode, non-maximum suppression (NMS) is applied to refine predictions.
"""
if augment:
return self._forward_augment(x) # augmented inference, None
return self._forward_once(x, profile, visualize) # single-scale inference, train
def _forward_augment(self, x):
"""Performs augmented inference across different scales and flips, returning combined detections."""
"""
Performs augmented inference by processing input across different scales and flips, merging the outputs.
Args:
x (torch.Tensor): Input tensor with shape (N, C, H, W), where N is batch size, C is number of channels,
H and W are height and width.
Returns:
(torch.Tensor): Merged output tensor after multi-scale and flip augmentations, with shape (N, M, no),
where N is batch size, M is the number of predictions, and no is the number of output features.
Example:
```python
model = DetectionModel(cfg='yolov5s.yaml')
input_tensor = torch.randn(1, 3, 640, 640)
output = model._forward_augment(input_tensor)
```
Note:
The function processes the input using different scales (1, 0.83, 0.67) and flips (None, horizontal),
descaling predictions before merging. This helps to improve model robustness and accuracy
during inference.
"""
img_size = x.shape[-2:] # height, width
s = [1, 0.83, 0.67] # scales
f = [None, 3, None] # flips (2-ud, 3-lr)
@@ -282,7 +594,23 @@ class DetectionModel(BaseModel):
return torch.cat(y, 1), None # augmented inference, train
def _descale_pred(self, p, flips, scale, img_size):
"""De-scales predictions from augmented inference, adjusting for flips and image size."""
"""
Adjusts predictions for augmented inference by de-scaling and correcting for flips or image size changes.
Args:
p (torch.Tensor): Predictions tensor with shape (..., N) where N indicates prediction attributes like
bounding box coordinates, confidence score, etc.
flips (int | None): Specifies flip mode. `2` for vertical flip, `3` for horizontal flip, and `None` for no flip.
scale (float): Scale factor used during augmentation.
img_size (tuple[int, int]): Original image dimensions as (height, width).
Returns:
(torch.Tensor): Adjusted predictions tensor with the same shape as input, de-scaled and de-flipped appropriately.
Note:
If inplace operations are enabled, the adjustments are applied directly on the tensor. Otherwise, new tensors are
created for the adjusted values to avoid modifying the original input.
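Example:
A hedged sketch with dummy predictions (values are illustrative; assumes `torch` is imported):
```python
model = DetectionModel(cfg="yolov5s.yaml")
p = torch.randn(1, 100, 85)  # (batch, detections, outputs)
p = model._descale_pred(p, flips=3, scale=0.83, img_size=(640, 640))  # undo 0.83 scaling and horizontal flip
```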
"""
if self.inplace:
p[..., :4] /= scale # de-scale
if flips == 2:
@@ -299,8 +627,18 @@ class DetectionModel(BaseModel):
return p
def _clip_augmented(self, y):
"""Clips augmented inference tails for YOLOv5 models, affecting first and last tensors based on grid points and
layer counts.
"""
Clip augmented inference tails for YOLOv5 models, adjusting predictions from the first and last layers.
Args:
y (list[torch.Tensor]): List of tensors, where each tensor represents detections from augmented inference across different layers.
Returns:
(list[torch.Tensor]): Modified list of tensors with clipped augmented inference tails.
Notes:
This function helps to discard the augmented tails by adjusting predictions from the first and last layers,
which might otherwise introduce artifacts due to the augmentation process.
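Example:
A minimal sketch with dummy augmented outputs (detection counts are illustrative; assumes `torch` is imported):
```python
model = DetectionModel(cfg="yolov5s.yaml")
y = [torch.randn(1, 25200, 85), torch.randn(1, 18522, 85), torch.randn(1, 6300, 85)]
y = model._clip_augmented(y)  # trims the large-scale tail of y[0] and the small-scale head of y[-1]
```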
"""
nl = self.model[-1].nl # number of detection layers (P3-P5)
g = sum(4**x for x in range(nl)) # grid points
@@ -313,9 +651,36 @@ class DetectionModel(BaseModel):
def _initialize_biases(self, cf=None):
"""
Initializes biases for YOLOv5's Detect() module, optionally using class frequencies (cf).
For details see https://arxiv.org/abs/1708.02002 section 3.3.
Initialize biases for the YOLOv5 Detect module using specified or default bias adjustments.
Args:
cf (torch.Tensor | None): Optional tensor representing class frequencies for bias initialization. The shape should be
(N,), where N is the number of classes. If not provided, default adjustments are applied based on the number of
classes and image dimensions.
Returns:
None. Biases of the final Detect module are updated in place.
Note:
The function calculates the biases based on principles from https://arxiv.org/abs/1708.02002, section 3.3. If class
frequencies (`cf`) are not provided, default bias adjustments are made. Adjustments primarily ensure that objectness and
class biases are reasonably initialized for effective training.
Example:
```python
from models.yolo import DetectionModel
import torch
# Initialize model
model = DetectionModel(cfg="yolov5s.yaml")
# Optional class frequencies tensor
class_frequencies = torch.tensor([100, 150, 200])
# Initialize biases
model._initialize_biases(cf=class_frequencies)
```
"""
# cf = torch.bincount(torch.tensor(np.concatenate(dataset.labels, 0)[:, 0]).long(), minlength=nc) + 1.
m = self.model[-1] # Detect() module
@@ -334,22 +699,91 @@ Model = DetectionModel # retain YOLOv5 'Model' class for backwards compatibility
class SegmentationModel(DetectionModel):
# YOLOv5 segmentation model
def __init__(self, cfg="yolov5s-seg.yaml", ch=3, nc=None, anchors=None):
"""Initializes a YOLOv5 segmentation model with configurable params: cfg (str) for configuration, ch (int) for channels, nc (int) for num classes, anchors (list)."""
"""
Initializes a YOLOv5 segmentation model with configurable parameters.
Args:
cfg (str): Path to the configuration file containing model architecture and parameters. Defaults to "yolov5s-seg.yaml".
ch (int): Number of input channels. Defaults to 3.
nc (int | None): Number of classes. If provided, overrides the number of classes specified in the cfg file.
anchors (list | None): List of anchor points. If provided, overrides the anchor configuration in the cfg file.
Returns:
(None): Initializes various components of the SegmentationModel class.
Example:
```python
from models.yolo import SegmentationModel
model = SegmentationModel()
```
Note:
The initialization includes setting up model layers, anchors, and other configurations based on the provided
or default configuration file.
"""
super().__init__(cfg, ch, nc, anchors)
class ClassificationModel(BaseModel):
# YOLOv5 classification model
def __init__(self, cfg=None, model=None, nc=1000, cutoff=10):
"""Initializes YOLOv5 model with config file `cfg`, input channels `ch`, number of classes `nc`, and `cuttoff`
index.
"""
Initializes a YOLOv5 classification model with either a configuration file or a pre-built model, specifying
the number of classes and a cutoff layer index.
Args:
cfg (str | None): Path to the model configuration file, or None if using `model`.
model (torch.nn.Module | None): Pre-built torch model, or None if using `cfg`.
nc (int): Number of output classes, default is 1000.
cutoff (int): Index of the cutoff layer, default is 10.
Returns:
None
Example:
```python
# Initializing from a configuration file
model = ClassificationModel(cfg='yolov5-class-config.yaml', nc=1000, cutoff=10)
# Initializing from an existing model
model = ClassificationModel(model=prebuilt_model, nc=1000, cutoff=10)
```
Note:
This model can be extended or customized by modifying the configuration file or the pre-built model.
"""
super().__init__()
self._from_detection_model(model, nc, cutoff) if model is not None else self._from_yaml(cfg)
def _from_detection_model(self, model, nc=1000, cutoff=10):
"""Creates a classification model from a YOLOv5 detection model, slicing at `cutoff` and adding a classification
layer.
"""
Perform a transformation from a YOLOv5 detection model to a classification model.
Args:
model (DetectionModel): A pre-trained YOLOv5 detection model.
nc (int): Number of classes for the classification model. Default is 1000.
cutoff (int): Index to slice the model's layers up to the classification layer. Default is 10.
Returns:
None. The function modifies the model in place.
Notes:
This function takes a detection model and transforms it into a classification model by slicing the model layers
at the specified cutoff point and adding a classification layer with the specified number of classes.
- If the input model is wrapped by `DetectMultiBackend`, it unwraps the model to get the underlying YOLOv5 model.
- Constructs a `Classify` layer, replacing the final detection layer with this new classification layer.
Example:
```python
from models.experimental import attempt_load
from models.yolo import ClassificationModel
# Load a pre-trained detection model
detection_model = attempt_load('yolov5s.pt')
# Create a classification model from the detection model
classification_model = ClassificationModel(model=detection_model, nc=1000, cutoff=10)
```
"""
if isinstance(model, DetectMultiBackend):
model = model.model # unwrap DetectMultiBackend
@@ -365,12 +799,49 @@ class ClassificationModel(BaseModel):
self.nc = nc
def _from_yaml(self, cfg):
"""Creates a YOLOv5 classification model from a specified *.yaml configuration file."""
"""
Initialize a YOLOv5 classification model from a *.yaml configuration file.
Args:
cfg (str): Path to the YOLOv5 YAML configuration file.
Returns:
None
Notes:
This method is currently a placeholder: it sets `self.model` to None rather than building the network from the
parsed configuration, so classification models are expected to be created via `_from_detection_model` instead.
"""
self.model = None
def parse_model(d, ch):
"""Parses a YOLOv5 model from a dict `d`, configuring layers based on input channels `ch` and model architecture."""
"""
Parses YOLOv5 model architecture from a configuration dictionary and initializes its layers.
Args:
d (dict): Dictionary containing model configuration. Must include keys: "anchors", "nc", "depth_multiple",
"width_multiple", and optionally "activation" and "channel_multiple".
ch (list[int]): List of input channels for each layer.
Returns:
(tuple[nn.Sequential, list[int]]): A tuple containing:
- `model` (nn.Sequential): The constructed YOLOv5 model based on the configuration.
- `save` (list[int]): List of layers whose outputs should be preserved during the forward pass.
Example:
```python
from pathlib import Path
import yaml
# Load model configuration YAML
with open(Path('yolov5s.yaml'), 'r') as file:
model_config = yaml.safe_load(file)
# Parse model and initialize
model, save = parse_model(model_config, ch=[3])
```
"""
LOGGER.info(f"\n{'':>3}{'from':>18}{'n':>3}{'params':>10} {'module':<40}{'arguments':<30}")
anchors, nc, gd, gw, act, ch_mul = (
d["anchors"],