# yolov5/models/tf.py — 1867 lines, 80 KiB, Python (file-listing metadata converted to a comment)
# Ultralytics YOLOv5 🚀, AGPL-3.0 license
"""
TensorFlow, Keras and TFLite versions of YOLOv5
Authored by https://github.com/zldrobit in PR https://github.com/ultralytics/yolov5/pull/1127
Usage:
$ python models/tf.py --weights yolov5s.pt
Export:
$ python export.py --weights yolov5s.pt --include saved_model pb tflite tfjs
"""
import argparse
import sys
from copy import deepcopy
from pathlib import Path
FILE = Path(__file__).resolve()
ROOT = FILE.parents[1] # YOLOv5 root directory
if str(ROOT) not in sys.path:
sys.path.append(str(ROOT)) # add ROOT to PATH
# ROOT = ROOT.relative_to(Path.cwd()) # relative
import numpy as np
import tensorflow as tf
import torch
import torch.nn as nn
from tensorflow import keras
from models.common import (
C3,
SPP,
SPPF,
Bottleneck,
BottleneckCSP,
C3x,
Concat,
Conv,
CrossConv,
DWConv,
DWConvTranspose2d,
Focus,
autopad,
)
from models.experimental import MixConv2d, attempt_load
from models.yolo import Detect, Segment
from utils.activations import SiLU
from utils.general import LOGGER, make_divisible, print_args
class TFBN(keras.layers.Layer):
    """Keras BatchNormalization layer whose parameters are copied from a PyTorch BatchNorm2d."""

    def __init__(self, w=None):
        """
        Build a BatchNormalization layer initialized from PyTorch weights.

        Args:
            w (torch.nn.BatchNorm2d | None): Source PyTorch layer supplying bias, weight,
                running_mean, running_var, and eps. Must be provided for initialization to work.
        """
        super().__init__()
        const = keras.initializers.Constant  # shorthand for the constant initializer
        self.bn = keras.layers.BatchNormalization(
            beta_initializer=const(w.bias.numpy()),
            gamma_initializer=const(w.weight.numpy()),
            moving_mean_initializer=const(w.running_mean.numpy()),
            moving_variance_initializer=const(w.running_var.numpy()),
            epsilon=w.eps,
        )

    def call(self, inputs):
        """Apply batch normalization to `inputs` and return the normalized tensor."""
        return self.bn(inputs)
class TFPad(keras.layers.Layer):
    """Zero-pads NHWC inputs symmetrically along spatial dimensions 1 (height) and 2 (width)."""

    def __init__(self, pad):
        """
        Create a spatial zero-padding layer.

        Args:
            pad (int | tuple[int, int]): Single int pads height and width equally;
                a tuple is interpreted as (pad_height, pad_width).
        """
        super().__init__()
        # Normalize to (height, width) padding amounts, then build the tf.pad spec
        # [batch, height, width, channels] with no padding on batch/channel axes.
        ph, pw = (pad, pad) if isinstance(pad, int) else (pad[0], pad[1])
        self.pad = tf.constant([[0, 0], [ph, ph], [pw, pw], [0, 0]])

    def call(self, inputs):
        """Return `inputs` zero-padded per the configured spec; batch and channel dims unchanged."""
        return tf.pad(inputs, self.pad, mode="constant", constant_values=0)
class TFConv(keras.layers.Layer):
    """Standard convolution: Conv2D + optional fused BatchNorm + activation (groups=1 only)."""

    def __init__(self, c1, c2, k=1, s=1, p=None, g=1, act=True, w=None):
        """
        Build a convolution layer from PyTorch weights.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            k (int): Kernel size.
            s (int): Stride.
            p (int | None): Padding; None selects autopad.
            g (int): Groups; must be 1 (unsupported by TF Conv2D).
            act (bool): Apply activation if True.
            w (torch.nn.Module | None): PyTorch source module; `w.bn` present means the
                conv carries no bias (BN absorbs it).
        """
        super().__init__()
        assert g == 1, "TF v2.2 Conv2D does not support 'groups' argument"
        # TensorFlow 'SAME' padding differs from PyTorch for strided convs (e.g. k=3 s=2),
        # so an explicit TFPad precedes a 'VALID' conv whenever s != 1.
        # See https://stackoverflow.com/questions/52975843/comparing-conv2d-with-padding-between-tensorflow-and-pytorch
        has_bn = hasattr(w, "bn")
        conv = keras.layers.Conv2D(
            filters=c2,
            kernel_size=k,
            strides=s,
            padding="SAME" if s == 1 else "VALID",
            use_bias=not has_bn,
            kernel_initializer=keras.initializers.Constant(w.conv.weight.permute(2, 3, 1, 0).numpy()),
            bias_initializer="zeros" if has_bn else keras.initializers.Constant(w.conv.bias.numpy()),
        )
        if s == 1:
            self.conv = conv
        else:
            self.conv = keras.Sequential([TFPad(autopad(k, p)), conv])
        self.bn = TFBN(w.bn) if has_bn else tf.identity
        self.act = activations(w.act) if act else tf.identity

    def call(self, inputs):
        """Run convolution, batch normalization, and activation on NHWC `inputs`, in that order."""
        return self.act(self.bn(self.conv(inputs)))
class TFDWConv(keras.layers.Layer):
    """Depthwise convolution with optional fused BatchNorm and activation."""

    def __init__(self, c1, c2, k=1, s=1, p=None, act=True, w=None):
        """
        Build a depthwise convolution layer from PyTorch weights.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels; must be a multiple of c1 (TF depth_multiplier requirement).
            k (int): Kernel size.
            s (int): Stride.
            p (int | tuple[int, int] | None): Padding; None selects autopad.
            act (bool): Apply activation if True.
            w (torch.nn.Module | None): PyTorch source module; `w.bn` present means the
                conv carries no bias (BN absorbs it).
        """
        super().__init__()
        assert c2 % c1 == 0, f"TFDWConv() output={c2} must be a multiple of input={c1} channels"
        fused_bn = hasattr(w, "bn")
        conv = keras.layers.DepthwiseConv2D(
            kernel_size=k,
            depth_multiplier=c2 // c1,
            strides=s,
            padding="SAME" if s == 1 else "VALID",
            use_bias=not fused_bn,
            depthwise_initializer=keras.initializers.Constant(w.conv.weight.permute(2, 3, 1, 0).numpy()),
            bias_initializer="zeros" if fused_bn else keras.initializers.Constant(w.conv.bias.numpy()),
        )
        # Strided convs need explicit padding to reproduce PyTorch's padding behavior.
        self.conv = keras.Sequential([TFPad(autopad(k, p)), conv]) if s != 1 else conv
        self.bn = TFBN(w.bn) if fused_bn else tf.identity
        self.act = activations(w.act) if act else tf.identity

    def call(self, inputs):
        """Apply depthwise convolution, batch normalization, and activation to NHWC `inputs`."""
        return self.act(self.bn(self.conv(inputs)))
class TFDWConvTranspose2d(keras.layers.Layer):
    """Depthwise ConvTranspose2d emulated with one single-filter Conv2DTranspose per channel."""

    def __init__(self, c1, c2, k=1, s=1, p1=0, p2=0, w=None):
        """
        Build the per-channel transposed convolutions from PyTorch weights.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels; must equal c1.
            k (int): Kernel size; only 4 is supported.
            s (int): Stride.
            p1 (int): Padding; only 1 is supported (applied as a crop in `call`).
            p2 (int): Output padding of the transposed convolution.
            w (torch.nn.Module): PyTorch ConvTranspose2d supplying weight and bias.
        """
        super().__init__()
        assert c1 == c2, f"TFDWConv() output={c2} must be equal to input={c1} channels"
        assert k == 4 and p1 == 1, "TFDWConv() only valid for k=4 and p1=1"
        weight, bias = w.weight.permute(2, 3, 1, 0).numpy(), w.bias.numpy()
        self.c1 = c1
        # One single-filter transposed conv per input channel emulates depthwise behavior.
        self.conv = []
        for i in range(c1):
            self.conv.append(
                keras.layers.Conv2DTranspose(
                    filters=1,
                    kernel_size=k,
                    strides=s,
                    padding="VALID",
                    output_padding=p2,
                    use_bias=True,
                    kernel_initializer=keras.initializers.Constant(weight[..., i : i + 1]),
                    bias_initializer=keras.initializers.Constant(bias[i]),
                )
            )

    def call(self, inputs):
        """Split NHWC input per channel, run each transposed conv, concat, and crop the p1=1 border."""
        channels = tf.split(inputs, self.c1, 3)
        outputs = [conv(ch) for conv, ch in zip(self.conv, channels)]
        return tf.concat(outputs, 3)[:, 1:-1, 1:-1]
class TFFocus(keras.layers.Layer):
    """Focuses width/height information into channel space, then convolves."""

    def __init__(self, c1, c2, k=1, s=1, p=None, g=1, act=True, w=None):
        """
        Build the Focus layer from PyTorch weights.

        Args:
            c1 (int): Input channels (before the 4x channel stacking).
            c2 (int): Output channels.
            k (int): Conv kernel size.
            s (int): Conv stride.
            p (int | None): Conv padding; None selects autopad.
            g (int): Conv groups; must be 1.
            act (bool): Apply activation if True.
            w (torch.nn.Module | None): PyTorch Focus module providing `w.conv` weights.
        """
        super().__init__()
        # The 2x2 pixel-unshuffle in call() quadruples the channel count before the conv.
        self.conv = TFConv(c1 * 4, c2, k, s, p, g, act, w.conv)

    def call(self, inputs):
        """Halve spatial dims via a 2x2 pixel unshuffle, stack the 4 patches into channels, convolve."""
        patches = [
            inputs[:, ::2, ::2, :],
            inputs[:, 1::2, ::2, :],
            inputs[:, ::2, 1::2, :],
            inputs[:, 1::2, 1::2, :],
        ]
        return self.conv(tf.concat(patches, 3))
class TFBottleneck(keras.layers.Layer):
    """Standard bottleneck: 1x1 conv -> 3x3 conv with optional residual shortcut."""

    def __init__(self, c1, c2, shortcut=True, g=1, e=0.5, w=None):
        """
        Initialize a standard bottleneck block from PyTorch weights.

        (Previous docstring incorrectly described this constructor as performing NMS.)

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            shortcut (bool): Add a residual connection when c1 == c2. Default True.
            g (int): Groups for the 3x3 convolution. Default 1.
            e (float): Expansion ratio for hidden channels. Default 0.5.
            w (torch.nn.Module | None): PyTorch module providing `cv1`/`cv2` weights.
        """
        super().__init__()
        c_ = int(c2 * e)  # hidden channels
        self.cv1 = TFConv(c1, c_, 1, 1, w=w.cv1)
        self.cv2 = TFConv(c_, c2, 3, 1, g=g, w=w.cv2)
        # Residual add is only valid when input/output channel counts match.
        self.add = shortcut and c1 == c2

    def call(self, inputs):
        """Apply the two convs; add `inputs` residually when the shortcut is enabled."""
        return inputs + self.cv2(self.cv1(inputs)) if self.add else self.cv2(self.cv1(inputs))
class TFCrossConv(keras.layers.Layer):
    """Cross convolution: a (1,k) conv followed by a (k,1) conv, with optional residual."""

    def __init__(self, c1, c2, k=3, s=1, g=1, e=1.0, shortcut=False, w=None):
        """
        Build the cross-convolution pair from PyTorch weights.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            k (int): Kernel extent for the (1,k)/(k,1) convs. Default 3.
            s (int): Stride. Default 1.
            g (int): Groups for the second conv. Default 1.
            e (float): Expansion ratio for hidden channels. Default 1.0.
            shortcut (bool): Add a residual connection when c1 == c2. Default False.
            w (torch.nn.Module | None): PyTorch module providing `cv1`/`cv2` weights.
        """
        super().__init__()
        c_ = int(c2 * e)  # hidden channels
        self.cv1 = TFConv(c1, c_, (1, k), (1, s), w=w.cv1)
        self.cv2 = TFConv(c_, c2, (k, 1), (s, 1), g=g, w=w.cv2)
        self.add = shortcut and c1 == c2

    def call(self, inputs):
        """Apply both cross convs; add `inputs` residually when the shortcut is enabled."""
        y = self.cv2(self.cv1(inputs))
        return inputs + y if self.add else y
class TFConv2d(keras.layers.Layer):
    """Drop-in Keras substitute for PyTorch nn.Conv2d (no group support, 'VALID' padding)."""

    def __init__(self, c1, c2, k, s=1, g=1, bias=True, w=None):
        """
        Build the Conv2D layer from PyTorch weights.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            k (int | tuple[int, int]): Kernel size.
            s (int): Stride. Default 1.
            g (int): Groups; must be 1 (unsupported by TF Conv2D). Default 1.
            bias (bool): Include a bias term. Default True.
            w (torch.nn.Conv2d | None): PyTorch layer providing weight (and bias if used).
        """
        super().__init__()
        assert g == 1, "TF v2.2 Conv2D does not support 'groups' argument"
        # PyTorch OIHW weights are permuted to TF's HWIO layout.
        kernel_init = keras.initializers.Constant(w.weight.permute(2, 3, 1, 0).numpy())
        bias_init = keras.initializers.Constant(w.bias.numpy()) if bias else None
        self.conv = keras.layers.Conv2D(
            filters=c2,
            kernel_size=k,
            strides=s,
            padding="VALID",
            use_bias=bias,
            kernel_initializer=kernel_init,
            bias_initializer=bias_init,
        )

    def call(self, inputs):
        """Convolve NHWC `inputs` with 'VALID' padding and return the result."""
        return self.conv(inputs)
class TFBottleneckCSP(keras.layers.Layer):
    """CSP Bottleneck, see https://github.com/WongKinYiu/CrossStagePartialNetworks."""

    def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5, w=None):
        """
        Initialize a CSP bottleneck layer from PyTorch weights.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            n (int): Number of stacked TFBottleneck layers. Default 1.
            shortcut (bool): Use residual connections inside bottlenecks. Default True.
            g (int): Convolution groups. Default 1.
            e (float): Hidden-channel expansion ratio. Default 0.5.
            w (object): PyTorch module providing pretrained weights (cv1..cv4, bn, m).
        """
        super().__init__()
        c_ = int(c2 * e)  # hidden channels
        self.cv1 = TFConv(c1, c_, 1, 1, w=w.cv1)
        self.cv2 = TFConv2d(c1, c_, 1, 1, bias=False, w=w.cv2)
        self.cv3 = TFConv2d(c_, c_, 1, 1, bias=False, w=w.cv3)
        self.cv4 = TFConv(2 * c_, c2, 1, 1, w=w.cv4)
        self.bn = TFBN(w.bn)
        self.act = keras.activations.swish  # direct reference; the lambda wrapper was redundant
        self.m = keras.Sequential([TFBottleneck(c_, c_, shortcut, g, e=1.0, w=w.m[j]) for j in range(n)])

    def call(self, inputs):
        """Run the two CSP paths, concat along channels, then BN -> swish -> final 1x1 conv."""
        y1 = self.cv3(self.m(self.cv1(inputs)))
        y2 = self.cv2(inputs)
        return self.cv4(self.act(self.bn(tf.concat((y1, y2), axis=3))))
class TFC3(keras.layers.Layer):
    """CSP Bottleneck with 3 convolutions."""

    def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5, w=None):
        """
        Build the C3 block from PyTorch weights.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            n (int): Number of stacked TFBottleneck layers. Default 1.
            shortcut (bool): Use residual connections inside bottlenecks. Default True.
            g (int): Convolution groups. Default 1.
            e (float): Hidden-channel expansion ratio. Default 0.5.
            w (object | None): PyTorch module providing pretrained weights (cv1..cv3, m).
        """
        super().__init__()
        c_ = int(c2 * e)  # hidden channels
        self.cv1 = TFConv(c1, c_, 1, 1, w=w.cv1)
        self.cv2 = TFConv(c1, c_, 1, 1, w=w.cv2)
        self.cv3 = TFConv(2 * c_, c2, 1, 1, w=w.cv3)
        self.m = keras.Sequential([TFBottleneck(c_, c_, shortcut, g, e=1.0, w=w.m[j]) for j in range(n)])

    def call(self, inputs):
        """Run the bottleneck path and the bypass path, concat on channels, then the final 1x1 conv."""
        branch_a = self.m(self.cv1(inputs))
        branch_b = self.cv2(inputs)
        return self.cv3(tf.concat((branch_a, branch_b), axis=3))
class TFC3x(keras.layers.Layer):
    """C3 module variant built from cross-convolutions (TFCrossConv) instead of bottlenecks."""

    def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5, w=None):
        """
        Build the C3x block from PyTorch weights.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            n (int): Number of stacked TFCrossConv layers. Default 1.
            shortcut (bool): Use residual connections inside the cross convs. Default True.
            g (int): Convolution groups. Default 1.
            e (float): Hidden-channel expansion ratio. Default 0.5.
            w (object | None): PyTorch module providing pretrained weights (cv1..cv3, m).
        """
        super().__init__()
        c_ = int(c2 * e)  # hidden channels
        self.cv1 = TFConv(c1, c_, 1, 1, w=w.cv1)
        self.cv2 = TFConv(c1, c_, 1, 1, w=w.cv2)
        self.cv3 = TFConv(2 * c_, c2, 1, 1, w=w.cv3)
        stack = [TFCrossConv(c_, c_, k=3, s=1, g=g, e=1.0, shortcut=shortcut, w=w.m[j]) for j in range(n)]
        self.m = keras.Sequential(stack)

    def call(self, inputs):
        """Run the cross-conv path and the bypass path, concat on channels, then the final 1x1 conv."""
        branch_a = self.m(self.cv1(inputs))
        branch_b = self.cv2(inputs)
        return self.cv3(tf.concat((branch_a, branch_b), axis=3))
class TFSPP(keras.layers.Layer):
    """Spatial pyramid pooling layer used in YOLOv3-SPP."""

    def __init__(self, c1, c2, k=(5, 9, 13), w=None):
        """
        Build the SPP block from PyTorch weights.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            k (tuple[int, ...]): Pool sizes for the parallel max-pool branches. Default (5, 9, 13).
            w (object | None): PyTorch module providing pretrained weights (cv1, cv2).
        """
        super().__init__()
        c_ = c1 // 2  # hidden channels
        self.cv1 = TFConv(c1, c_, 1, 1, w=w.cv1)
        # cv2 sees the original features plus one pooled copy per kernel size.
        self.cv2 = TFConv(c_ * (len(k) + 1), c2, 1, 1, w=w.cv2)
        self.m = [keras.layers.MaxPool2D(pool_size=size, strides=1, padding="SAME") for size in k]

    def call(self, inputs):
        """Reduce channels, max-pool at each scale, concat all branches, then project to c2."""
        y = self.cv1(inputs)
        pooled = [pool(y) for pool in self.m]
        return self.cv2(tf.concat([y, *pooled], 3))
class TFSPPF(keras.layers.Layer):
    # Spatial pyramid pooling-Fast layer
    def __init__(self, c1, c2, k=5, w=None):
        """
        Build the SPPF block: one shared stride-1 max-pool applied three times in sequence, which is
        equivalent to (but cheaper than) SPP with kernels (k, 2k-1, 3k-2).

        Args:
            c1 (int): Number of input channels.
            c2 (int): Number of output channels.
            k (int): Kernel size of the shared max-pool. Defaults to 5.
            w (object | None): Pretrained PyTorch SPPF module supplying cv1/cv2 weights.
        """
        super().__init__()
        mid = c1 // 2  # hidden channels
        self.cv1 = TFConv(c1, mid, 1, 1, w=w.cv1)
        self.cv2 = TFConv(mid * 4, c2, 1, 1, w=w.cv2)  # 4 stages concatenated: input + 3 pooled copies
        self.m = keras.layers.MaxPool2D(pool_size=k, strides=1, padding="SAME")

    def call(self, inputs):
        """
        Apply cv1, chain the shared max-pool three times, concatenate all four stages on the channel
        axis, and fuse with cv2.

        Args:
            inputs (tf.Tensor): NHWC input tensor.

        Returns:
            (tf.Tensor): NHWC output tensor with c2 channels; spatial size is unchanged.
        """
        feat = self.cv1(inputs)
        pooled1 = self.m(feat)
        pooled2 = self.m(pooled1)
        pooled3 = self.m(pooled2)
        return self.cv2(tf.concat([feat, pooled1, pooled2, pooled3], 3))
class TFDetect(keras.layers.Layer):
    # TF YOLOv5 Detect layer
    def __init__(self, nc=80, anchors=(), ch=(), imgsz=(640, 640), w=None):
        """
        Initialize the TensorFlow YOLOv5 detection head from a pretrained PyTorch Detect module.

        Args:
            nc (int): Number of classes.
            anchors (tuple | list): Anchor definition, one entry per detection layer; each entry holds
                flattened (w, h) pairs, so anchors-per-layer is len(anchors[0]) // 2.
            ch (tuple[int]): Input channel count for each detection layer.
            imgsz (tuple[int, int]): Inference image size (height, width). Fixed at build time because
                the coordinate grids are precomputed per stride below.
            w (nn.Module): PyTorch Detect module supplying stride, anchors, and per-layer conv weights.

        Returns:
            None
        """
        super().__init__()
        self.stride = tf.convert_to_tensor(w.stride.numpy(), dtype=tf.float32)
        self.nc = nc  # number of classes
        self.no = nc + 5  # number of outputs per anchor: box xywh (4) + objectness (1) + classes
        self.nl = len(anchors)  # number of detection layers
        self.na = len(anchors[0]) // 2  # number of anchors
        self.grid = [tf.zeros(1)] * self.nl  # init grid
        self.anchors = tf.convert_to_tensor(w.anchors.numpy(), dtype=tf.float32)
        # Anchors are stored in grid units; multiply by stride to express them in input-image pixels
        self.anchor_grid = tf.reshape(self.anchors * tf.reshape(self.stride, [self.nl, 1, 1]), [self.nl, 1, -1, 1, 2])
        self.m = [TFConv2d(x, self.no * self.na, 1, w=w.m[i]) for i, x in enumerate(ch)]
        self.training = False  # set to False after building model
        self.imgsz = imgsz
        for i in range(self.nl):
            # imgsz is fixed, so each layer's grid can be built once here instead of per call
            ny, nx = self.imgsz[0] // self.stride[i], self.imgsz[1] // self.stride[i]
            self.grid[i] = self._make_grid(nx, ny)

    def call(self, inputs):
        """
        Run the detection head over the per-layer feature maps.

        Args:
            inputs (list[tf.Tensor]): One NHWC feature map per detection layer.

        Returns:
            (tf.Tensor | tuple[tf.Tensor]): In training mode, the stacked raw per-layer predictions; in
                inference mode, a 1-tuple holding a (batch, total_anchors, no) tensor where box xywh is
                decoded and normalized to 0-1, objectness/class scores have sigmoid applied, and any
                extra trailing outputs (e.g. Segment mask coefficients) pass through unchanged.
        """
        z = []  # inference output
        x = []
        for i in range(self.nl):
            x.append(self.m[i](inputs[i]))
            # x(bs,20,20,255) to x(bs,3,20,20,85)
            ny, nx = self.imgsz[0] // self.stride[i], self.imgsz[1] // self.stride[i]
            x[i] = tf.reshape(x[i], [-1, ny * nx, self.na, self.no])

            if not self.training:  # inference
                y = x[i]
                grid = tf.transpose(self.grid[i], [0, 2, 1, 3]) - 0.5
                anchor_grid = tf.transpose(self.anchor_grid[i], [0, 2, 1, 3]) * 4
                xy = (tf.sigmoid(y[..., 0:2]) * 2 + grid) * self.stride[i]  # xy
                wh = tf.sigmoid(y[..., 2:4]) ** 2 * anchor_grid
                # Normalize xywh to 0-1 to reduce calibration error
                xy /= tf.constant([[self.imgsz[1], self.imgsz[0]]], dtype=tf.float32)
                wh /= tf.constant([[self.imgsz[1], self.imgsz[0]]], dtype=tf.float32)
                # Sigmoid objectness + class scores; leave any trailing extra channels raw
                y = tf.concat([xy, wh, tf.sigmoid(y[..., 4 : 5 + self.nc]), y[..., 5 + self.nc :]], -1)
                z.append(tf.reshape(y, [-1, self.na * ny * nx, self.no]))

        return tf.transpose(x, [0, 2, 1, 3]) if self.training else (tf.concat(z, 1),)

    @staticmethod
    def _make_grid(nx=20, ny=20):
        """
        Generate a flattened 2D coordinate grid for a feature map of size (ny, nx).

        Args:
            nx (int): Number of grid columns.
            ny (int): Number of grid rows.

        Returns:
            (tf.Tensor): Float32 tensor of shape (1, 1, ny * nx, 2) holding (x, y) cell coordinates,
                used to offset the decoded box centers in `call`.
        """
        # return torch.stack((xv, yv), 2).view((1, 1, ny, nx, 2)).float()
        xv, yv = tf.meshgrid(tf.range(nx), tf.range(ny))
        return tf.cast(tf.reshape(tf.stack([xv, yv], 2), [1, 1, ny * nx, 2]), dtype=tf.float32)
class TFSegment(TFDetect):
    # YOLOv5 Segment head for segmentation models
    def __init__(self, nc=80, anchors=(), nm=32, npr=256, ch=(), imgsz=(640, 640), w=None):
        """
        Build the YOLOv5 segmentation head on top of the TFDetect detection head.

        Args:
            nc (int): Number of classes.
            anchors (tuple): Anchor definition, one entry per detection layer.
            nm (int): Number of mask coefficients predicted per anchor.
            npr (int): Number of prototype channels.
            ch (tuple[int]): Input channel count for each detection layer.
            imgsz (tuple[int, int]): Inference image size (height, width).
            w (object): Pretrained PyTorch Segment module supplying weights.
        """
        super().__init__(nc, anchors, ch, imgsz, w)
        self.nm = nm  # number of masks
        self.npr = npr  # number of protos
        self.no = 5 + nc + self.nm  # outputs per anchor: box(4) + objectness(1) + classes + mask coeffs
        # Rebuild the per-layer output convs so they include the extra mask-coefficient channels
        heads = []
        for i, c in enumerate(ch):
            heads.append(TFConv2d(c, self.no * self.na, 1, w=w.m[i]))
        self.m = heads
        self.proto = TFProto(ch[0], self.npr, self.nm, w=w.proto)  # protos
        self.detect = TFDetect.call  # reuse the parent forward pass (called unbound with self below)

    def call(self, x):
        """
        Run detection plus mask-prototype prediction.

        Args:
            x (list[tf.Tensor]): One NHWC feature map per detection layer; x[0] also feeds the proto branch.

        Returns:
            (tuple): (detections, protos). Detections come from TFDetect.call — raw per-layer maps when
                training, otherwise the decoded prediction tensor. Protos are transposed to NCHW to match
                the PyTorch head's layout.
        """
        protos = tf.transpose(self.proto(x[0]), [0, 3, 1, 2])  # e.g. (1,160,160,32) -> (1,32,160,160)
        preds = self.detect(self, x)
        if self.training:
            return (preds, protos)
        return (preds[0], protos)
class TFProto(keras.layers.Layer):
    def __init__(self, c1, c_=256, c2=32, w=None):
        """
        Build the mask-prototype branch: 3x3 conv, 2x nearest upsample, 3x3 conv, then an output conv.

        Args:
            c1 (int): Number of input channels.
            c_ (int): Number of intermediate channels. Defaults to 256.
            c2 (int): Number of output (prototype) channels. Defaults to 32.
            w (object | None): Pretrained PyTorch Proto module supplying cv1/cv2/cv3 weights.
        """
        super().__init__()
        self.cv1 = TFConv(c1, c_, k=3, w=w.cv1)
        self.upsample = TFUpsample(None, scale_factor=2, mode="nearest")
        self.cv2 = TFConv(c_, c_, k=3, w=w.cv2)
        self.cv3 = TFConv(c_, c2, w=w.cv3)

    def call(self, inputs):
        """Run the prototype branch stage by stage and return the prototype feature map."""
        out = self.cv1(inputs)
        out = self.upsample(out)
        out = self.cv2(out)
        return self.cv3(out)
class TFUpsample(keras.layers.Layer):
    # TF version of torch.nn.Upsample()
    def __init__(self, size, scale_factor, mode, w=None):
        """
        Configure an upsampling layer backed by tf.image.resize.

        Args:
            size (tuple | None): Unused; kept for parity with torch.nn.Upsample's signature.
            scale_factor (int): Spatial multiplier; must be a multiple of 2.
            mode (str): Interpolation method accepted by tf.image.resize (e.g. 'nearest', 'bilinear').
            w (None): Unused weight placeholder, kept so layer construction matches the other TF layers.
        """
        super().__init__()
        assert scale_factor % 2 == 0, "scale_factor must be multiple of 2"

        def _resize(x):
            # Target size comes from the static NHWC shape, so inputs must have known H/W dimensions
            new_hw = (x.shape[1] * scale_factor, x.shape[2] * scale_factor)
            return tf.image.resize(x, new_hw, mode)

        self.upsample = _resize
        # Alternatives kept from the original implementation for reference:
        # keras.layers.UpSampling2D(size=scale_factor, interpolation=mode)
        # tf.raw_ops.ResizeNearestNeighbor(images=x, size=(x.shape[1] * 2, x.shape[2] * 2))

    def call(self, inputs):
        """Resize the NHWC input spatially by the configured scale factor and return the result."""
        return self.upsample(inputs)
class TFConcat(keras.layers.Layer):
    # TF version of torch.concat()
    def __init__(self, dimension=1, w=None):
        """
        Set up a concatenation layer that translates PyTorch's NCHW dim=1 into TensorFlow's NHWC axis 3.

        Args:
            dimension (int): Concatenation dimension in PyTorch terms; only 1 (channels) is supported.
            w (None): Unused weight placeholder, kept so layer construction matches the other TF layers.
        """
        super().__init__()
        assert dimension == 1, "convert only NCHW to NHWC concat"
        self.d = 3  # NHWC channel axis

    def call(self, inputs):
        """
        Concatenate a list of NHWC tensors along the channel axis.

        Args:
            inputs (list[tf.Tensor]): Tensors with matching batch and spatial dimensions.

        Returns:
            (tf.Tensor): Single tensor whose channel count is the sum of the inputs' channels.
        """
        return tf.concat(inputs, self.d)
def parse_model(d, ch, model, imgsz):
    """
    Translate a YOLOv5 model config into an equivalent Keras layer graph, copying weights from `model`.

    Args:
        d (dict): Model definition with 'anchors', 'nc', 'depth_multiple', 'width_multiple', optional
            'channel_multiple', and 'backbone'/'head' layer lists of [from, number, module, args] rows.
        ch (list[int]): Running list of per-layer output channels; starts with the input channel count.
        model (object): Loaded PyTorch model whose `model.model[i]` submodules supply pretrained weights.
        imgsz (tuple[int, int]): Input image size (height, width), forwarded to Detect/Segment heads.

    Returns:
        (tuple[keras.Sequential, list[int]]): The assembled Keras model and the sorted list of layer
            indices whose outputs must be saved for later skip/concat connections.

    Note:
        Module names and string arguments from the config are resolved with eval(); the config dict must
        come from a trusted source.
    """
    LOGGER.info(f"\n{'':>3}{'from':>18}{'n':>3}{'params':>10} {'module':<40}{'arguments':<30}")
    anchors, nc, gd, gw, ch_mul = (
        d["anchors"],
        d["nc"],
        d["depth_multiple"],
        d["width_multiple"],
        d.get("channel_multiple"),
    )
    na = (len(anchors[0]) // 2) if isinstance(anchors, list) else anchors  # number of anchors
    no = na * (nc + 5)  # number of outputs = anchors * (classes + 5)
    if not ch_mul:
        ch_mul = 8  # default channel multiple when 'channel_multiple' is absent from the config

    layers, save, c2 = [], [], ch[-1]  # layers, savelist, ch out
    for i, (f, n, m, args) in enumerate(d["backbone"] + d["head"]):  # from, number, module, args
        m_str = m
        m = eval(m) if isinstance(m, str) else m  # eval strings
        for j, a in enumerate(args):
            try:
                args[j] = eval(a) if isinstance(a, str) else a  # eval strings
            except NameError:
                pass  # non-expression strings (e.g. 'nearest') are kept as-is

        n = max(round(n * gd), 1) if n > 1 else n  # depth gain
        if m in [
            nn.Conv2d,
            Conv,
            DWConv,
            DWConvTranspose2d,
            Bottleneck,
            SPP,
            SPPF,
            MixConv2d,
            Focus,
            CrossConv,
            BottleneckCSP,
            C3,
            C3x,
        ]:
            c1, c2 = ch[f], args[0]
            # Scale output channels by width multiple, except for the final detection outputs
            c2 = make_divisible(c2 * gw, ch_mul) if c2 != no else c2

            args = [c1, c2, *args[1:]]
            if m in [BottleneckCSP, C3, C3x]:
                args.insert(2, n)  # these modules take the repeat count as a constructor arg
                n = 1
        elif m is nn.BatchNorm2d:
            args = [ch[f]]
        elif m is Concat:
            c2 = sum(ch[-1 if x == -1 else x + 1] for x in f)
        elif m in [Detect, Segment]:
            args.append([ch[x + 1] for x in f])
            if isinstance(args[1], int):  # number of anchors
                args[1] = [list(range(args[1] * 2))] * len(f)
            if m is Segment:
                args[3] = make_divisible(args[3] * gw, ch_mul)
            args.append(imgsz)
        else:
            c2 = ch[f]

        # Look up the TF* counterpart of the PyTorch module by name (e.g. Conv -> TFConv)
        tf_m = eval("TF" + m_str.replace("nn.", ""))
        m_ = (
            keras.Sequential([tf_m(*args, w=model.model[i][j]) for j in range(n)])
            if n > 1
            else tf_m(*args, w=model.model[i])
        )  # module
        torch_m_ = nn.Sequential(*(m(*args) for _ in range(n))) if n > 1 else m(*args)  # module
        t = str(m)[8:-2].replace("__main__.", "")  # module type
        np = sum(x.numel() for x in torch_m_.parameters())  # number params (NOTE: shadows the numpy alias locally)
        m_.i, m_.f, m_.type, m_.np = i, f, t, np  # attach index, 'from' index, type, number params
        LOGGER.info(f"{i:>3}{str(f):>18}{str(n):>3}{np:>10} {t:<40}{str(args):<30}")  # print
        save.extend(x % i for x in ([f] if isinstance(f, int) else f) if x != -1)  # append to savelist
        layers.append(m_)
        ch.append(c2)
    return keras.Sequential(layers), sorted(save)
class TFModel:
    # TF YOLOv5 model
    def __init__(self, cfg="yolov5s.yaml", ch=3, nc=None, model=None, imgsz=(640, 640)):
        """
        Build the TensorFlow YOLOv5 model from a config and a pretrained PyTorch model's weights.

        Args:
            cfg (str | dict): Path to a YOLOv5 YAML config or an already-loaded config dictionary.
            ch (int): Number of input channels, usually 3 for RGB images.
            nc (int | None): Number of classes; when given, overrides the config's 'nc' value.
            model (torch.nn.Module | None): Pretrained PyTorch model whose weights seed the TF layers.
            imgsz (tuple[int, int]): Image size (height, width); fixed at build time.

        Returns:
            None
        """
        super().__init__()
        if isinstance(cfg, dict):
            self.yaml = cfg  # model dict
        else:  # is *.yaml
            import yaml  # for torch hub

            self.yaml_file = Path(cfg).name
            with open(cfg) as f:
                self.yaml = yaml.load(f, Loader=yaml.FullLoader)  # model dict

        # Define model
        if nc and nc != self.yaml["nc"]:
            LOGGER.info(f"Overriding {cfg} nc={self.yaml['nc']} with nc={nc}")
            self.yaml["nc"] = nc  # override yaml value
        self.model, self.savelist = parse_model(deepcopy(self.yaml), ch=[ch], model=model, imgsz=imgsz)

    def predict(
        self,
        inputs,
        tf_nms=False,
        agnostic_nms=False,
        topk_per_class=100,
        topk_all=100,
        iou_thres=0.45,
        conf_thres=0.25,
    ):
        """
        Run a forward pass through the parsed layer graph, optionally followed by TensorFlow NMS.

        Args:
            inputs (tf.Tensor): NHWC input image tensor.
            tf_nms (bool): If True, apply TensorFlow NMS and return filtered detections. Default False.
            agnostic_nms (bool): If True, use class-agnostic NMS instead of per-class NMS. Default False.
            topk_per_class (int): Max boxes kept per class by combined NMS. Default 100.
            topk_all (int): Max boxes kept over all classes. Default 100.
            iou_thres (float): IoU threshold for suppression. Default 0.45.
            conf_thres (float): Score threshold for keeping boxes. Default 0.25.

        Returns:
            (tuple | tf.Tensor): With tf_nms, a 1-tuple holding the NMS result; otherwise the raw model
                output, e.g. shape [1, 6300, 85] laid out as [xywh, conf, class0, class1, ...].
        """
        y = []  # outputs
        x = inputs
        for m in self.model.layers:
            if m.f != -1:  # if not from previous layer
                x = y[m.f] if isinstance(m.f, int) else [x if j == -1 else y[j] for j in m.f]  # from earlier layers

            x = m(x)  # run
            y.append(x if m.i in self.savelist else None)  # save output

        # Add TensorFlow NMS
        if tf_nms:
            boxes = self._xywh2xyxy(x[0][..., :4])
            probs = x[0][:, :, 4:5]  # objectness confidence
            classes = x[0][:, :, 5:]  # per-class probabilities
            scores = probs * classes  # confidence-weighted class scores used for suppression
            if agnostic_nms:
                nms = AgnosticNMS()((boxes, classes, scores), topk_all, iou_thres, conf_thres)
            else:
                boxes = tf.expand_dims(boxes, 2)
                nms = tf.image.combined_non_max_suppression(
                    boxes, scores, topk_per_class, topk_all, iou_thres, conf_thres, clip_boxes=False
                )
            return (nms,)
        return x  # output [1,6300,85] = [xywh, conf, class0, class1, ...]
        # x = x[0]  # [x(1,6300,85), ...] to x(6300,85)
        # xywh = x[..., :4]  # x(6300,4) boxes
        # conf = x[..., 4:5]  # x(6300,1) confidences
        # cls = tf.reshape(tf.cast(tf.argmax(x[..., 5:], axis=1), tf.float32), (-1, 1))  # x(6300,1) classes
        # return tf.concat([conf, cls, xywh], 1)

    @staticmethod
    def _xywh2xyxy(xywh):
        """
        Convert bounding boxes from center [x, y, w, h] format to corner [x1, y1, x2, y2] format.

        Args:
            xywh (tf.Tensor): Boxes in [x, y, w, h] format; the conversion is applied along the last axis.

        Returns:
            (tf.Tensor): Boxes in [x1, y1, x2, y2] format with top-left (x1, y1) and bottom-right (x2, y2).

        Example:
            ```python
            boxes_xywh = tf.constant([[100, 150, 200, 250]], dtype=tf.float32)
            boxes_xyxy = TFModel._xywh2xyxy(boxes_xywh)
            print(boxes_xyxy)
            # Output: tf.Tensor([[  0.  25. 200. 275.]], shape=(1, 4), dtype=float32)
            ```

        Note:
            Used internally to prepare model output for TensorFlow NMS, which expects corner format.
        """
        x, y, w, h = tf.split(xywh, num_or_size_splits=4, axis=-1)
        return tf.concat([x - w / 2, y - h / 2, x + w / 2, y + h / 2], axis=-1)
class AgnosticNMS(keras.layers.Layer):
    # TF Agnostic NMS
    def call(self, input, topk_all, iou_thres, conf_thres):
        """
        Apply class-agnostic non-maximum suppression to a batch of predictions via tf.map_fn.

        Args:
            input (tuple[tf.Tensor, tf.Tensor, tf.Tensor]): Batched (boxes, classes, scores) tensors:
                boxes (B, N, 4) in xyxy format, classes (B, N, nc) per-class probabilities, and
                scores (B, N, nc) confidence-weighted class scores used for suppression.
            topk_all (int): Maximum number of detections kept per image.
            iou_thres (float): IoU threshold above which overlapping boxes are suppressed.
            conf_thres (float): Minimum score for a box to be considered.

        Returns:
            (tuple[tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor]): Per-image padded boxes (B, topk_all, 4),
                scores (B, topk_all), class indices (B, topk_all), and valid detection counts (B,) as int32.
        """
        return tf.map_fn(
            lambda x: self._nms(x, topk_all, iou_thres, conf_thres),
            input,
            fn_output_signature=(tf.float32, tf.float32, tf.float32, tf.int32),
            name="agnostic_nms",
        )

    @staticmethod
    def _nms(x, topk_all=100, iou_thres=0.45, conf_thres=0.25):
        """
        Run class-agnostic NMS for a single image and pad the results to a fixed size.

        Args:
            x (tuple[tf.Tensor, tf.Tensor, tf.Tensor]): Per-image (boxes, classes, scores):
                boxes (N, 4) in xyxy format, classes (N, nc) per-class probabilities (argmax gives the
                predicted class index), scores (N, nc) suppression scores (max over classes is used).
            topk_all (int): Maximum number of detections to keep.
            iou_thres (float): IoU threshold for suppression.
            conf_thres (float): Score threshold for keeping boxes.

        Returns:
            (tuple[tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor]): Padded boxes (topk_all, 4) zero-filled,
                padded scores (topk_all,) and class indices (topk_all,) both filled with -1.0, and the
                scalar int32 count of valid detections.

        Note:
            Class-agnostic: all classes compete in a single suppression pass, so overlapping boxes of
            different classes suppress each other.
        """
        boxes, classes, scores = x
        class_inds = tf.cast(tf.argmax(classes, axis=-1), tf.float32)
        scores_inp = tf.reduce_max(scores, -1)  # best class score per box drives the suppression
        selected_inds = tf.image.non_max_suppression(
            boxes, scores_inp, max_output_size=topk_all, iou_threshold=iou_thres, score_threshold=conf_thres
        )
        selected_boxes = tf.gather(boxes, selected_inds)
        padded_boxes = tf.pad(
            selected_boxes,
            paddings=[[0, topk_all - tf.shape(selected_boxes)[0]], [0, 0]],
            mode="CONSTANT",
            constant_values=0.0,
        )
        selected_scores = tf.gather(scores_inp, selected_inds)
        padded_scores = tf.pad(
            selected_scores,
            paddings=[[0, topk_all - tf.shape(selected_boxes)[0]]],
            mode="CONSTANT",
            constant_values=-1.0,
        )
        selected_classes = tf.gather(class_inds, selected_inds)
        padded_classes = tf.pad(
            selected_classes,
            paddings=[[0, topk_all - tf.shape(selected_boxes)[0]]],
            mode="CONSTANT",
            constant_values=-1.0,
        )
        valid_detections = tf.shape(selected_inds)[0]
        return padded_boxes, padded_scores, padded_classes, valid_detections
def activations(act=nn.SiLU):
    """
    Map a PyTorch activation module to its TensorFlow/Keras equivalent.

    Args:
        act (torch.nn.Module): PyTorch activation instance. Supported types: nn.LeakyReLU, nn.Hardswish,
            and nn.SiLU (including the custom SiLU from utils.activations).

    Returns:
        (callable): Function that applies the equivalent TensorFlow activation to a tensor.

    Raises:
        Exception: If `act` has no TensorFlow counterpart.
    """
    # Supported activation types are disjoint classes, so the check order does not affect the result
    if isinstance(act, (nn.SiLU, SiLU)):
        return lambda x: keras.activations.swish(x)
    if isinstance(act, nn.Hardswish):
        return lambda x: x * tf.nn.relu6(x + 3) * 0.166666667
    if isinstance(act, nn.LeakyReLU):
        return lambda x: keras.activations.relu(x, alpha=0.1)
    raise Exception(f"no matching TensorFlow activation found for PyTorch activation {act}")
def representative_dataset_gen(dataset, ncalib=100):
    """
    Yield calibration samples for TensorFlow Lite post-training quantization.

    Args:
        dataset (iterable): Iterable yielding (path, img, im0s, vid_cap, string) tuples, where `img` is a
            CHW image array as produced by the YOLOv5 dataloaders.
        ncalib (int): Maximum number of calibration samples to yield. Default is 100.

    Yields:
        (list[np.ndarray]): Single-element list holding one float32 tensor of shape (1, H, W, C) with
            values scaled from 0-255 to 0-1.

    Note:
        Fixes an off-by-one in the original loop, which yielded ncalib + 1 samples because it only broke
        after the sample at index ncalib had already been yielded.
    """
    for n, (path, img, im0s, vid_cap, string) in enumerate(dataset):
        im = np.transpose(img, [1, 2, 0])  # CHW -> HWC
        im = np.expand_dims(im, axis=0).astype(np.float32)  # add batch dimension
        im /= 255  # normalize 0-255 -> 0-1
        yield [im]
        if n + 1 >= ncalib:  # stop once exactly ncalib samples have been yielded
            break
def run(
    weights=ROOT / "yolov5s.pt",  # weights path
    imgsz=(640, 640),  # inference size h,w
    batch_size=1,  # batch size
    dynamic=False,  # dynamic batch size
):
    """
    Export a YOLOv5 PyTorch model to TensorFlow/Keras form and run dummy inference on each to verify.

    Args:
        weights (str | Path): Path to the pre-trained YOLOv5 `.pt` weights file.
        imgsz (tuple[int, int]): Inference size as (height, width).
        batch_size (int): Batch size used for the dummy inference inputs.
        dynamic (bool): If True, build the Keras model with a dynamic (None) batch dimension.

    Returns:
        None: Prints model summaries and logs a success message; nothing is returned.

    Note:
        The actual export to disk is handled by export.py; this function only verifies that the
        PyTorch -> TensorFlow -> Keras conversion produces runnable models.
    """
    # PyTorch: load the checkpoint on CPU and run one BCHW forward pass as a sanity check
    pt_input = torch.zeros((batch_size, 3, *imgsz))  # BCHW image
    pt_model = attempt_load(weights, device=torch.device("cpu"), inplace=True, fuse=False)
    _ = pt_model(pt_input)  # inference
    pt_model.info()

    # TensorFlow: rebuild the network from the PyTorch modules and run a BHWC forward pass
    tf_input = tf.zeros((batch_size, *imgsz, 3))  # BHWC image
    tf_model = TFModel(cfg=pt_model.yaml, model=pt_model, nc=pt_model.nc, imgsz=imgsz)
    _ = tf_model.predict(tf_input)  # inference

    # Keras: wrap the TF graph in a keras.Model so it can be summarized and later exported
    keras_input = keras.Input(shape=(*imgsz, 3), batch_size=None if dynamic else batch_size)
    keras_model = keras.Model(inputs=keras_input, outputs=tf_model.predict(keras_input))
    keras_model.summary()

    LOGGER.info("PyTorch, TensorFlow and Keras models successfully verified.\nUse export.py for TF model export.")
def parse_opt():
    """
    Parse command-line options for the TF/Keras model verification script.

    Args:
        None

    Returns:
        (argparse.Namespace): Parsed arguments with attributes 'weights' (str), 'imgsz' (list[int]),
            'batch_size' (int), and 'dynamic' (bool).

    Example:
        ```python
        opts = parse_opt()
        print(opts.weights)  # Outputs the path to the weights file
        ```

    See Also:
        - https://github.com/ultralytics/ultralytics for repository context and additional details.
        - `argparse` documentation: https://docs.python.org/3/library/argparse.html.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--weights", type=str, default=ROOT / "yolov5s.pt", help="weights path")
    parser.add_argument("--imgsz", "--img", "--img-size", nargs="+", type=int, default=[640], help="inference size h,w")
    parser.add_argument("--batch-size", type=int, default=1, help="batch size")
    parser.add_argument("--dynamic", action="store_true", help="dynamic batch size")
    opt = parser.parse_args()
    if len(opt.imgsz) == 1:
        opt.imgsz *= 2  # expand a single size into square (h, w)
    print_args(vars(opt))
    return opt
def main(opt):
    """
    Entry point: forward parsed command-line options to `run` for model conversion and verification.

    Args:
        opt (argparse.Namespace): Parsed options — weights path, image size, batch size, and the
            dynamic batch-size flag.

    Returns:
        None
    """
    options = vars(opt)
    run(**options)
if __name__ == "__main__":
    # Script entry point: parse CLI options and run the conversion/verification.
    main(parse_opt())