Add dataset transforms (#1)

pull/5/head
HinGwenWoong 2022-09-18 10:36:41 +08:00 committed by GitHub
parent cfec074cf6
commit e4359a15a3
2 changed files with 605 additions and 0 deletions


@@ -0,0 +1,8 @@
# Copyright (c) OpenMMLab. All rights reserved.
from .transforms import (LetterResize, LoadAnnotations, YOLOv5HSVRandomAug,
                         YOLOv5KeepRatioResize, YOLOv5RandomAffine)

__all__ = [
    'YOLOv5KeepRatioResize', 'LetterResize', 'YOLOv5HSVRandomAug',
    'LoadAnnotations', 'YOLOv5RandomAffine'
]
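Once registered, these transforms are referenced by their ``type`` name in dataset pipeline configs. A minimal sketch (ordering and values illustrative; ``LoadImageFromFile`` is assumed from mmcv):

train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations', with_bbox=True),
    dict(type='YOLOv5KeepRatioResize', scale=(640, 640)),
    dict(type='LetterResize', scale=(640, 640), allow_scale_up=False),
    dict(type='YOLOv5RandomAffine', scaling_ratio_range=(0.5, 1.5)),
    dict(type='YOLOv5HSVRandomAug'),
]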


@@ -0,0 +1,597 @@
# Copyright (c) OpenMMLab. All rights reserved.
import math
from typing import Tuple, Union
import cv2
import mmcv
import numpy as np
import torch
from mmcv.transforms import BaseTransform
from mmcv.transforms.utils import cache_randomness
from numpy import random
from mmdet.datasets.transforms import LoadAnnotations as MMDET_LoadAnnotations
from mmdet.datasets.transforms import Resize as MMDET_Resize
from mmdet.structures.bbox import autocast_box_type, get_box_type
from mmyolo.registry import TRANSFORMS
@TRANSFORMS.register_module()
class YOLOv5KeepRatioResize(MMDET_Resize):
"""Resize images & bbox(if existed).
This transform resizes the input image according to ``scale``.
Bboxes (if existed) are then resized with the same scale factor.
Required Keys:
- img (np.uint8)
- gt_bboxes (BaseBoxes[torch.float32]) (optional)
Modified Keys:
- img (np.uint8)
- img_shape (tuple)
- gt_bboxes (optional)
- scale (float)
Added Keys:
- scale_factor (np.float32)
Args:
        scale (Union[int, Tuple[int, int]]): Image scales for resizing.
"""
def __init__(self,
scale: Union[int, Tuple[int, int]],
keep_ratio: bool = True,
**kwargs) -> None:
assert keep_ratio is True
super().__init__(scale=scale, keep_ratio=True, **kwargs)
@staticmethod
    def _get_rescale_ratio(old_size: Tuple[int, int],
                           scale: Union[float, int, Tuple[int, int]]) -> float:
"""Calculate the rescale ratio.
Args:
            old_size (tuple[int]): The old size (h, w) of the image.
scale (float | tuple[int]): The scaling factor or maximum size.
If it is a float number, then the image will be rescaled by
this factor, else if it is a tuple of 2 integers, then
the image will be rescaled as large as possible within
the scale.
Returns:
float: The resize ratio.
"""
        h, w = old_size
if isinstance(scale, (float, int)):
if scale <= 0:
raise ValueError(f'Invalid scale {scale}, must be positive.')
scale_factor = scale
elif isinstance(scale, tuple):
max_long_edge = max(scale)
max_short_edge = min(scale)
scale_factor = min(max_long_edge / max(h, w),
max_short_edge / min(h, w))
else:
raise TypeError('Scale must be a number or tuple of int, '
f'but got {type(scale)}')
return scale_factor
def _resize_img(self, results: dict) -> None:
"""Resize images with ``results['scale']``."""
assert self.keep_ratio is True
if results.get('img', None) is not None:
image = results['img']
original_h, original_w = image.shape[:2]
ratio = self._get_rescale_ratio((original_h, original_w),
self.scale)
if ratio != 1:
# resize image according to the ratio
image = mmcv.imrescale(
img=image,
scale=ratio,
interpolation='area' if ratio < 1 else 'bilinear',
backend=self.backend)
resized_h, resized_w = image.shape[:2]
scale_ratio = resized_h / original_h
scale_factor = np.array([scale_ratio, scale_ratio],
dtype=np.float32)
results['img'] = image
results['img_shape'] = image.shape[:2]
results['scale_factor'] = scale_factor
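A minimal standalone sketch of the behaviour above (illustrative, assuming the inherited ``Resize.transform`` from mmdet drives these hooks): a 480x640 image already fits a (640, 640) target, so the ratio is 1.0 and only the bookkeeping keys are written.

import numpy as np

resize = YOLOv5KeepRatioResize(scale=(640, 640))
results = resize(dict(img=np.zeros((480, 640, 3), dtype=np.uint8)))
assert results['img_shape'] == (480, 640)  # longer edge already 640
assert tuple(results['scale_factor']) == (1.0, 1.0)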
@TRANSFORMS.register_module()
class LetterResize(MMDET_Resize):
"""Resize and pad image while meeting stride-multiple constraints.
Required Keys:
- img (np.uint8)
- batch_shape (np.int64) (optional)
Modified Keys:
- img (np.uint8)
- img_shape (tuple)
- gt_bboxes (optional)
Added Keys:
- pad_param (np.float32)
Args:
        scale (Union[int, Tuple[int, int]]): Image scales for resizing.
        pad_val (dict): Padding value. Defaults to dict(img=0, mask=0,
            seg=255).
        use_mini_pad (bool): Whether to use minimum rectangle padding.
            Defaults to False.
        stretch_only (bool): Whether to stretch to the specified size
            directly. Defaults to False.
        allow_scale_up (bool): Allow scale up when ratio > 1.
            Defaults to True.
"""
def __init__(self,
scale: Union[int, Tuple[int, int]],
pad_val: dict = dict(img=0, mask=0, seg=255),
use_mini_pad: bool = False,
stretch_only: bool = False,
allow_scale_up: bool = True,
**kwargs) -> None:
super().__init__(scale=scale, keep_ratio=True, **kwargs)
        if isinstance(pad_val, (int, float)):
            pad_val = dict(img=pad_val, seg=255)
        assert isinstance(
            pad_val, dict), f'pad_val must be dict, but got {type(pad_val)}'
        # store the normalized dict so ``self.pad_val.get(...)`` works even
        # when a scalar was passed in
        self.pad_val = pad_val

        self.use_mini_pad = use_mini_pad
        self.stretch_only = stretch_only
        self.allow_scale_up = allow_scale_up
def _resize_img(self, results: dict) -> None:
"""Resize images with ``results['scale']``."""
image = results.get('img', None)
if image is None:
return
# Use batch_shape if a batch_shape policy is configured
if 'batch_shape' in results:
self.scale = tuple(results['batch_shape'])
image_shape = image.shape[:2] # height, width
# Scale ratio (new / old)
ratio = min(self.scale[0] / image_shape[0],
self.scale[1] / image_shape[1])
# only scale down, do not scale up (for better test mAP)
if not self.allow_scale_up:
ratio = min(ratio, 1.0)
ratio = [ratio, ratio] # float -> (float, float) for (height, width)
# compute the best size of the image
no_pad_shape = (int(round(image_shape[0] * ratio[0])),
int(round(image_shape[1] * ratio[1])))
# padding height & width
padding_h, padding_w = [
self.scale[0] - no_pad_shape[0], self.scale[1] - no_pad_shape[1]
]
if self.use_mini_pad:
# minimum rectangle padding
padding_w, padding_h = np.mod(padding_w, 32), np.mod(padding_h, 32)
elif self.stretch_only:
# stretch to the specified size directly
padding_h, padding_w = 0.0, 0.0
no_pad_shape = (self.scale[0], self.scale[1])
ratio = [
self.scale[0] / image_shape[0], self.scale[1] / image_shape[1]
] # height, width ratios
# divide padding into 2 sides
padding_h /= 2
padding_w /= 2
        if image_shape != no_pad_shape:
            # both shapes are (h, w); resize only when they differ
image = mmcv.imrescale(
image,
no_pad_shape,
interpolation=self.interpolation,
backend=self.backend)
scale_factor = np.array([ratio[0], ratio[1]], dtype=np.float32)
if 'scale_factor' in results:
results['scale_factor'] = results['scale_factor'] * scale_factor
else:
results['scale_factor'] = scale_factor
# padding
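        # Note: the -0.1/+0.1 offsets below make ``round`` split an odd
        # padding total across the two sides (e.g. 3.5 -> 3 and 4), so the
        # padded size always matches the target exactly.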
top_padding, bottom_padding = int(round(padding_h - 0.1)), int(
round(padding_h + 0.1))
left_padding, right_padding = int(round(padding_w - 0.1)), int(
round(padding_w + 0.1))
padding_list = [
top_padding, bottom_padding, left_padding, right_padding
]
if top_padding != 0 or bottom_padding != 0 or \
left_padding != 0 or right_padding != 0:
pad_val = self.pad_val.get('img', 0)
            if isinstance(pad_val, int) and image.ndim == 3:
                # broadcast the scalar padding value to all image channels
                pad_val = tuple(pad_val for _ in range(image.shape[2]))
            image = mmcv.impad(
                img=image,
                padding=(padding_list[2], padding_list[0], padding_list[3],
                         padding_list[1]),
                pad_val=pad_val,
padding_mode='constant')
results['img'] = image
        results['img_shape'] = image.shape[:2]
results['pad_param'] = np.array(padding_list, dtype=np.float32)
def _resize_masks(self, results: dict) -> None:
"""Resize masks with ``results['scale']``"""
if results.get('gt_masks', None) is None:
return
# resize the gt_masks
gt_mask_height = results['gt_masks'].height * \
results['scale_factor'][0]
gt_mask_width = results['gt_masks'].width * \
results['scale_factor'][1]
gt_masks = results['gt_masks'].rescale((gt_mask_height, gt_mask_width))
# padding the gt_masks
if len(gt_masks) == 0:
padded_masks = np.empty((0, *results['img_shape'][:2]),
dtype=np.uint8)
else:
            # TODO: This is still incorrect, because the rescaled masks may
            # not be paddable to the target shape.
padded_masks = np.stack([
mmcv.impad(
mask,
padding=(int(results['pad_param'][2]),
int(results['pad_param'][0]),
int(results['pad_param'][3]),
int(results['pad_param'][1])),
                    pad_val=self.pad_val.get('mask', 0)) for mask in gt_masks
])
results['gt_masks'] = type(results['gt_masks'])(
padded_masks, *results['img_shape'][:2])
def _resize_bboxes(self, results: dict) -> None:
"""Resize bounding boxes with ``results['scale_factor']``."""
if results.get('gt_bboxes', None) is None:
return
results['gt_bboxes'].rescale_(results['scale_factor'])
if len(results['pad_param']) != 4:
return
        # pad_param is (top, bottom, left, right); bboxes shift by (left, top)
        results['gt_bboxes'].translate_(
            (results['pad_param'][2], results['pad_param'][0]))
if self.clip_object_border:
results['gt_bboxes'].clip_(results['img_shape'])
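A sketch of LetterResize in isolation (illustrative, with the same assumption about the inherited ``Resize.transform`` entry point): a 240x320 image targeted at (640, 640) is up-scaled 2x to 480x640, then padded with 80 px above and below.

import numpy as np

letter = LetterResize(scale=(640, 640))
results = letter(dict(img=np.zeros((240, 320, 3), dtype=np.uint8)))
assert results['img'].shape[:2] == (640, 640)  # 2x up-scale, then pad
assert tuple(results['scale_factor']) == (2.0, 2.0)
assert results['pad_param'].tolist() == [80, 80, 0, 0]  # top, bottom, left, right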
# TODO: Check if it can be merged with mmdet.YOLOXHSVRandomAug
@TRANSFORMS.register_module()
class YOLOv5HSVRandomAug(BaseTransform):
"""Apply HSV augmentation to image sequentially.
Required Keys:
- img
Modified Keys:
- img
Args:
        hue_delta (Union[int, float]): delta of hue. Defaults to 0.015.
        saturation_delta (Union[int, float]): delta of saturation.
            Defaults to 0.7.
        value_delta (Union[int, float]): delta of value. Defaults to 0.4.
"""
def __init__(self,
hue_delta: Union[int, float] = 0.015,
saturation_delta: Union[int, float] = 0.7,
value_delta: Union[int, float] = 0.4):
self.hue_delta = hue_delta
self.saturation_delta = saturation_delta
self.value_delta = value_delta
    def transform(self, results: dict) -> dict:
        """Randomly jitter the hue, saturation and value of the image."""
hsv_gains = \
random.uniform(-1, 1, 3) * \
[self.hue_delta, self.saturation_delta, self.value_delta] + 1
hue, sat, val = cv2.split(
cv2.cvtColor(results['img'], cv2.COLOR_BGR2HSV))
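        # Build one 256-entry look-up table per channel: OpenCV hue lives in
        # [0, 180), so the hue table wraps, while S and V are clipped to 255.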
table_list = np.arange(0, 256, dtype=hsv_gains.dtype)
lut_hue = ((table_list * hsv_gains[0]) % 180).astype(np.uint8)
lut_sat = np.clip(table_list * hsv_gains[1], 0, 255).astype(np.uint8)
lut_val = np.clip(table_list * hsv_gains[2], 0, 255).astype(np.uint8)
im_hsv = cv2.merge(
(cv2.LUT(hue, lut_hue), cv2.LUT(sat,
lut_sat), cv2.LUT(val, lut_val)))
results['img'] = cv2.cvtColor(im_hsv, cv2.COLOR_HSV2BGR)
return results
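Because the gains are multiplicative and centred on 1.0, the default deltas only jitter the image mildly; shape and dtype are preserved. A quick sketch (illustrative):

import numpy as np

aug = YOLOv5HSVRandomAug()
img = np.random.randint(0, 256, (32, 32, 3), dtype=np.uint8)
out = aug(dict(img=img))
assert out['img'].shape == img.shape and out['img'].dtype == np.uint8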
# TODO: can be accelerated
@TRANSFORMS.register_module()
class LoadAnnotations(MMDET_LoadAnnotations):
"""Because the yolo series does not need to consider ignore bboxes for the
time being, in order to speed up the pipeline, it can be excluded in
advance."""
def _load_bboxes(self, results: dict) -> None:
"""Private function to load bounding box annotations.
Note: BBoxes with ignore_flag of 1 is not considered.
Args:
results (dict): Result dict from :obj:``mmengine.BaseDataset``.
Returns:
dict: The dict contains loaded bounding box annotations.
"""
gt_bboxes = []
gt_ignore_flags = []
for instance in results['instances']:
if instance['ignore_flag'] == 0:
gt_bboxes.append(instance['bbox'])
gt_ignore_flags.append(instance['ignore_flag'])
results['gt_ignore_flags'] = np.array(gt_ignore_flags, dtype=bool)
if self.box_type is None:
results['gt_bboxes'] = np.array(
gt_bboxes, dtype=np.float32).reshape((-1, 4))
else:
_, box_type_cls = get_box_type(self.box_type)
results['gt_bboxes'] = box_type_cls(gt_bboxes, dtype=torch.float32)
def _load_labels(self, results: dict) -> None:
"""Private function to load label annotations.
Note: BBoxes with ignore_flag of 1 is not considered.
Args:
results (dict): Result dict from :obj:``mmengine.BaseDataset``.
Returns:
dict: The dict contains loaded label annotations.
"""
gt_bboxes_labels = []
for instance in results['instances']:
if instance['ignore_flag'] == 0:
gt_bboxes_labels.append(instance['bbox_label'])
results['gt_bboxes_labels'] = np.array(
gt_bboxes_labels, dtype=np.int64)
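A sketch of the filtering (the ``instances`` layout follows the mmengine annotation convention; values illustrative): the ignored instance is dropped before any box structure is built.

loader = LoadAnnotations(with_bbox=True)
results = loader(dict(instances=[
    dict(bbox=[0, 0, 10, 10], bbox_label=0, ignore_flag=0),
    dict(bbox=[5, 5, 8, 8], bbox_label=1, ignore_flag=1),  # skipped
]))
assert len(results['gt_bboxes']) == 1
assert results['gt_bboxes_labels'].tolist() == [0]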
@TRANSFORMS.register_module()
class YOLOv5RandomAffine(BaseTransform):
"""Random affine transform data augmentation in YOLOv5. It is different
from the implementation in YOLOX.
    This operation randomly generates an affine transform matrix that
    combines rotation, translation, shear and scaling transforms.
Required Keys:
- img
- gt_bboxes (BaseBoxes[torch.float32]) (optional)
- gt_bboxes_labels (np.int64) (optional)
    - gt_ignore_flags (bool) (optional)
Modified Keys:
- img
- img_shape
- gt_bboxes (optional)
- gt_bboxes_labels (optional)
- gt_ignore_flags (optional)
Args:
max_rotate_degree (float): Maximum degrees of rotation transform.
Defaults to 10.
max_translate_ratio (float): Maximum ratio of translation.
Defaults to 0.1.
scaling_ratio_range (tuple[float]): Min and max ratio of
scaling transform. Defaults to (0.5, 1.5).
max_shear_degree (float): Maximum degrees of shear
transform. Defaults to 2.
border (tuple[int]): Distance from height and width sides of input
image to adjust output shape. Only used in mosaic dataset.
Defaults to (0, 0).
border_val (tuple[int]): Border padding values of 3 channels.
Defaults to (114, 114, 114).
bbox_clip_border (bool, optional): Whether to clip the objects outside
the border of the image. In some dataset like MOT17, the gt bboxes
are allowed to cross the border of images. Therefore, we don't
need to clip the gt bboxes in these cases. Defaults to True.
"""
def __init__(self,
max_rotate_degree: float = 10.0,
max_translate_ratio: float = 0.1,
scaling_ratio_range: Tuple[float, float] = (0.5, 1.5),
max_shear_degree: float = 2.0,
border: Tuple[int, int] = (0, 0),
border_val: Tuple[int, int, int] = (114, 114, 114),
bbox_clip_border: bool = True,
                 min_bbox_size: float = 2,
                 min_area_ratio: float = 0.1,
                 max_aspect_ratio: float = 20) -> None:
assert 0 <= max_translate_ratio <= 1
assert scaling_ratio_range[0] <= scaling_ratio_range[1]
assert scaling_ratio_range[0] > 0
self.max_rotate_degree = max_rotate_degree
self.max_translate_ratio = max_translate_ratio
self.scaling_ratio_range = scaling_ratio_range
self.max_shear_degree = max_shear_degree
self.border = border
self.border_val = border_val
self.bbox_clip_border = bbox_clip_border
        self.min_bbox_size = min_bbox_size
self.min_area_ratio = min_area_ratio
self.max_aspect_ratio = max_aspect_ratio
@cache_randomness
def _get_random_homography_matrix(self, height, width):
# Rotation
rotation_degree = random.uniform(-self.max_rotate_degree,
self.max_rotate_degree)
rotation_matrix = self._get_rotation_matrix(rotation_degree)
# Scaling
scaling_ratio = random.uniform(self.scaling_ratio_range[0],
self.scaling_ratio_range[1])
scaling_matrix = self._get_scaling_matrix(scaling_ratio)
# Shear
x_degree = random.uniform(-self.max_shear_degree,
self.max_shear_degree)
y_degree = random.uniform(-self.max_shear_degree,
self.max_shear_degree)
shear_matrix = self._get_shear_matrix(x_degree, y_degree)
# Translation
trans_x = random.uniform(0.5 - self.max_translate_ratio,
0.5 + self.max_translate_ratio) * width
trans_y = random.uniform(0.5 - self.max_translate_ratio,
0.5 + self.max_translate_ratio) * height
translate_matrix = self._get_translation_matrix(trans_x, trans_y)
warp_matrix = (
translate_matrix @ shear_matrix @ rotation_matrix @ scaling_matrix)
return warp_matrix, scaling_ratio
@autocast_box_type()
def transform(self, results: dict) -> dict:
img = results['img']
height = img.shape[0] + self.border[0] * 2
width = img.shape[1] + self.border[1] * 2
# Note: Different from YOLOX
center_matrix = np.eye(3, dtype=np.float32)
center_matrix[0, 2] = -img.shape[1] / 2
center_matrix[1, 2] = -img.shape[0] / 2
warp_matrix, scaling_ratio = self._get_random_homography_matrix(
height, width)
warp_matrix = warp_matrix @ center_matrix
img = cv2.warpPerspective(
img,
warp_matrix,
dsize=(width, height),
borderValue=self.border_val)
results['img'] = img
results['img_shape'] = img.shape
bboxes = results['gt_bboxes']
num_bboxes = len(bboxes)
if num_bboxes:
orig_bboxes = bboxes.clone()
bboxes.project_(warp_matrix)
if self.bbox_clip_border:
bboxes.clip_([height, width])
# filter bboxes
orig_bboxes.rescale_([scaling_ratio, scaling_ratio])
            # Be careful: valid_index must be converted to numpy, otherwise
            # indexing raises an out-of-bounds error when len(valid_index)=1
valid_index = self.filter_gt_bboxes(orig_bboxes, bboxes).numpy()
results['gt_bboxes'] = bboxes[valid_index]
results['gt_bboxes_labels'] = results['gt_bboxes_labels'][
valid_index]
results['gt_ignore_flags'] = results['gt_ignore_flags'][
valid_index]
if 'gt_masks' in results:
raise NotImplementedError('RandomAffine only supports bbox.')
return results
    def filter_gt_bboxes(self, origin_bboxes, wrapped_bboxes):
        """Filter bboxes that degenerate after the affine warp.

        A box is kept only if it stays larger than ``min_bbox_size``, keeps
        at least ``min_area_ratio`` of its original area, and its aspect
        ratio stays below ``max_aspect_ratio``.
        """
origin_w = origin_bboxes.widths
origin_h = origin_bboxes.heights
wrapped_w = wrapped_bboxes.widths
wrapped_h = wrapped_bboxes.heights
aspect_ratio = np.maximum(wrapped_w / (wrapped_h + 1e-16),
wrapped_h / (wrapped_w + 1e-16))
wh_valid_idx = (wrapped_w > self.min_bbox_size) & \
(wrapped_h > self.min_bbox_size)
area_valid_idx = wrapped_w * wrapped_h / (origin_w * origin_h +
1e-16) > self.min_area_ratio
aspect_ratio_valid_idx = aspect_ratio < self.max_aspect_ratio
return wh_valid_idx & area_valid_idx & aspect_ratio_valid_idx
def __repr__(self):
repr_str = self.__class__.__name__
repr_str += f'(max_rotate_degree={self.max_rotate_degree}, '
repr_str += f'max_translate_ratio={self.max_translate_ratio}, '
repr_str += f'scaling_ratio_range={self.scaling_ratio_range}, '
repr_str += f'max_shear_degree={self.max_shear_degree}, '
repr_str += f'border={self.border}, '
repr_str += f'border_val={self.border_val}, '
        repr_str += f'bbox_clip_border={self.bbox_clip_border}, '
        repr_str += f'min_bbox_size={self.min_bbox_size}, '
        repr_str += f'min_area_ratio={self.min_area_ratio}, '
        repr_str += f'max_aspect_ratio={self.max_aspect_ratio})'
return repr_str
@staticmethod
def _get_rotation_matrix(rotate_degrees: float) -> np.ndarray:
radian = math.radians(rotate_degrees)
rotation_matrix = np.array(
[[np.cos(radian), -np.sin(radian), 0.],
[np.sin(radian), np.cos(radian), 0.], [0., 0., 1.]],
dtype=np.float32)
return rotation_matrix
@staticmethod
def _get_scaling_matrix(scale_ratio: float) -> np.ndarray:
scaling_matrix = np.array(
[[scale_ratio, 0., 0.], [0., scale_ratio, 0.], [0., 0., 1.]],
dtype=np.float32)
return scaling_matrix
@staticmethod
def _get_shear_matrix(x_shear_degrees: float,
y_shear_degrees: float) -> np.ndarray:
x_radian = math.radians(x_shear_degrees)
y_radian = math.radians(y_shear_degrees)
shear_matrix = np.array([[1, np.tan(x_radian), 0.],
[np.tan(y_radian), 1, 0.], [0., 0., 1.]],
dtype=np.float32)
return shear_matrix
@staticmethod
def _get_translation_matrix(x: float, y: float) -> np.ndarray:
translation_matrix = np.array([[1, 0., x], [0., 1, y], [0., 0., 1.]],
dtype=np.float32)
return translation_matrix
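Reading ``transform`` together with the helpers above: the random warp is ``translate @ shear @ rotate @ scale`` (applied right to left), and in ``transform`` it is further right-multiplied by the centering matrix, so the centre shift is applied first. With all randomness collapsed to zero, only the fixed half-size translation remains, which this sketch (illustrative) checks directly:

import numpy as np

affine = YOLOv5RandomAffine(
    max_rotate_degree=0.0,
    max_translate_ratio=0.0,
    scaling_ratio_range=(1.0, 1.0),
    max_shear_degree=0.0)
warp, ratio = affine._get_random_homography_matrix(height=640, width=640)
assert ratio == 1.0
assert np.allclose(warp, [[1., 0., 320.], [0., 1., 320.], [0., 0., 1.]])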