mmclassification/mmpretrain/datasets/transforms/formatting.py

# Copyright (c) OpenMMLab. All rights reserved.
from collections import defaultdict
from collections.abc import Sequence

import cv2
import numpy as np
import torch
import torchvision.transforms.functional as F
from mmcv.transforms import BaseTransform
from mmengine.utils import is_str
from PIL import Image

from mmpretrain.registry import TRANSFORMS
from mmpretrain.structures import DataSample, MultiTaskDataSample


def to_tensor(data):
    """Convert objects of various python types to :obj:`torch.Tensor`.

    Supported types are: :class:`numpy.ndarray`, :class:`torch.Tensor`,
    :class:`Sequence`, :class:`int` and :class:`float`.
    """
    if isinstance(data, torch.Tensor):
        return data
    elif isinstance(data, np.ndarray):
        return torch.from_numpy(data)
    elif isinstance(data, Sequence) and not is_str(data):
        return torch.tensor(data)
    elif isinstance(data, int):
        return torch.LongTensor([data])
    elif isinstance(data, float):
        return torch.FloatTensor([data])
    else:
        raise TypeError(
            f'Type {type(data)} cannot be converted to tensor.'
            'Supported types are: `numpy.ndarray`, `torch.Tensor`, '
            '`Sequence`, `int` and `float`')


@TRANSFORMS.register_module()
class PackInputs(BaseTransform):
    """Pack the inputs data.

    **Required Keys:**

    - ``input_key``
    - ``*algorithm_keys``
    - ``*meta_keys``

    **Deleted Keys:**

    All other keys in the dict.

    **Added Keys:**

    - inputs (:obj:`torch.Tensor`): The forward data of models.
    - data_samples (:obj:`~mmpretrain.structures.DataSample`): The
      annotation info of the sample.

    Args:
        input_key (str): The key of element to feed into the model forwarding.
            Defaults to 'img'.
        algorithm_keys (Sequence[str]): The keys of custom elements to be used
            in the algorithm. Defaults to an empty tuple.
        meta_keys (Sequence[str]): The keys of meta information to be saved in
            the data sample. Defaults to :attr:`PackInputs.DEFAULT_META_KEYS`.

    .. admonition:: Default algorithm keys

        Besides the specified ``algorithm_keys``, we will set some default keys
        into the output data sample and do some formatting. Therefore, you
        don't need to set these keys in the ``algorithm_keys``.

        - ``gt_label``: The ground-truth label. The value will be converted
          into a 1-D tensor.
        - ``gt_score``: The ground-truth score. The value will be converted
          into a 1-D tensor.
        - ``mask``: The mask for some self-supervise tasks. The value will
          be converted into a tensor.

    .. admonition:: Default meta keys

        - ``sample_idx``: The id of the image sample.
        - ``img_path``: The path to the image file.
        - ``ori_shape``: The original shape of the image as a tuple (H, W).
        - ``img_shape``: The shape of the image after the pipeline as a
          tuple (H, W).
        - ``scale_factor``: The scale factor between the resized image and
          the original image.
        - ``flip``: A boolean indicating if image flip transform was used.
        - ``flip_direction``: The flipping direction.
    """

    DEFAULT_META_KEYS = ('sample_idx', 'img_path', 'ori_shape', 'img_shape',
                         'scale_factor', 'flip', 'flip_direction')

    def __init__(self,
                 input_key='img',
                 algorithm_keys=(),
                 meta_keys=DEFAULT_META_KEYS):
        self.input_key = input_key
        self.algorithm_keys = algorithm_keys
        self.meta_keys = meta_keys

    @staticmethod
    def format_input(input_):
        if isinstance(input_, list):
            return [PackInputs.format_input(item) for item in input_]
        elif isinstance(input_, np.ndarray):
            if input_.ndim == 2:  # For grayscale image.
                input_ = np.expand_dims(input_, -1)
            if input_.ndim == 3 and not input_.flags.c_contiguous:
                input_ = np.ascontiguousarray(input_.transpose(2, 0, 1))
                input_ = to_tensor(input_)
            elif input_.ndim == 3:
                # convert to tensor first to accelerate, see
                # https://github.com/open-mmlab/mmdetection/pull/9533
                input_ = to_tensor(input_).permute(2, 0, 1).contiguous()
            else:
                # convert input with other shape to tensor without permute,
                # like video input (num_crops, C, T, H, W).
                input_ = to_tensor(input_)
        elif isinstance(input_, Image.Image):
            input_ = F.pil_to_tensor(input_)
        elif not isinstance(input_, torch.Tensor):
            raise TypeError(f'Unsupported input type {type(input_)}.')

        return input_

    def transform(self, results: dict) -> dict:
        """Method to pack the input data."""
        packed_results = dict()
        if self.input_key in results:
            input_ = results[self.input_key]
            packed_results['inputs'] = self.format_input(input_)

        data_sample = DataSample()

        # Set default keys
        if 'gt_label' in results:
            data_sample.set_gt_label(results['gt_label'])
        if 'gt_score' in results:
            data_sample.set_gt_score(results['gt_score'])
        if 'mask' in results:
            data_sample.set_mask(results['mask'])

        # Set custom algorithm keys
        for key in self.algorithm_keys:
            if key in results:
                data_sample.set_field(results[key], key)

        # Set meta keys
        for key in self.meta_keys:
            if key in results:
                data_sample.set_field(results[key], key, field_type='metainfo')

        packed_results['data_samples'] = data_sample
        return packed_results

    def __repr__(self) -> str:
        repr_str = self.__class__.__name__
        repr_str += f"(input_key='{self.input_key}', "
        repr_str += f'algorithm_keys={self.algorithm_keys}, '
        repr_str += f'meta_keys={self.meta_keys})'
        return repr_str


@TRANSFORMS.register_module()
class PackMultiTaskInputs(BaseTransform):
    """Convert all image labels of multi-task dataset to a dict of tensor.

    Args:
        multi_task_fields (Sequence[str]):
        input_key (str):
        task_handlers (dict):
    """

    def __init__(self,
                 multi_task_fields,
                 input_key='img',
                 task_handlers=dict()):
        self.multi_task_fields = multi_task_fields
        self.input_key = input_key
        self.task_handlers = defaultdict(PackInputs)
        for task_name, task_handler in task_handlers.items():
            self.task_handlers[task_name] = TRANSFORMS.build(task_handler)

    def transform(self, results: dict) -> dict:
        """Method to pack the input data.

        result = {'img_path': 'a.png', 'gt_label': {'task1': 1, 'task3': 3},
            'img': array([[[  0,   0,   0])
        """
        packed_results = dict()
        results = results.copy()

        if self.input_key in results:
            input_ = results[self.input_key]
            packed_results['inputs'] = PackInputs.format_input(input_)

        task_results = defaultdict(dict)
        for field in self.multi_task_fields:
            if field in results:
                value = results.pop(field)
                for k, v in value.items():
                    task_results[k].update({field: v})

        data_sample = MultiTaskDataSample()
        for task_name, task_result in task_results.items():
            task_handler = self.task_handlers[task_name]
            task_pack_result = task_handler({**results, **task_result})
            data_sample.set_field(task_pack_result['data_samples'], task_name)

        packed_results['data_samples'] = data_sample
        return packed_results

    def __repr__(self):
        repr = self.__class__.__name__
        task_handlers = ', '.join(
            f"'{name}': {handler.__class__.__name__}"
            for name, handler in self.task_handlers.items())
        repr += f'(multi_task_fields={self.multi_task_fields}, '
        repr += f"input_key='{self.input_key}', "
        repr += f'task_handlers={{{task_handlers}}})'
        return repr


@TRANSFORMS.register_module()
class Transpose(BaseTransform):
    """Transpose numpy array.

    **Required Keys:**

    - ``*keys``

    **Modified Keys:**

    - ``*keys``

    Args:
        keys (List[str]): The fields to convert to tensor.
        order (List[int]): The output dimensions order.
    """

    def __init__(self, keys, order):
        self.keys = keys
        self.order = order

    def transform(self, results):
        """Method to transpose array."""
        for key in self.keys:
            results[key] = results[key].transpose(self.order)
        return results

    def __repr__(self):
        return self.__class__.__name__ + \
            f'(keys={self.keys}, order={self.order})'


@TRANSFORMS.register_module(('NumpyToPIL', 'ToPIL'))
class NumpyToPIL(BaseTransform):
    """Convert the image from OpenCV format to :obj:`PIL.Image.Image`.

    **Required Keys:**

    - ``img``

    **Modified Keys:**

    - ``img``

    Args:
        to_rgb (bool): Whether to convert img to rgb. Defaults to True.
    """

    def __init__(self, to_rgb: bool = False) -> None:
        self.to_rgb = to_rgb

    def transform(self, results: dict) -> dict:
        """Method to convert images to :obj:`PIL.Image.Image`."""
        img = results['img']
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) if self.to_rgb else img

        results['img'] = Image.fromarray(img)
        return results

    def __repr__(self) -> str:
        return self.__class__.__name__ + f'(to_rgb={self.to_rgb})'


@TRANSFORMS.register_module(('PILToNumpy', 'ToNumpy'))
class PILToNumpy(BaseTransform):
    """Convert img to :obj:`numpy.ndarray`.

    **Required Keys:**

    - ``img``

    **Modified Keys:**

    - ``img``

    Args:
        to_bgr (bool): Whether to convert img to rgb. Defaults to True.
        dtype (str, optional): The dtype of the converted numpy array.
            Defaults to None.
    """

    def __init__(self, to_bgr: bool = False, dtype=None) -> None:
        self.to_bgr = to_bgr
        self.dtype = dtype

    def transform(self, results: dict) -> dict:
        """Method to convert img to :obj:`numpy.ndarray`."""
        img = np.array(results['img'], dtype=self.dtype)
        img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR) if self.to_bgr else img

        results['img'] = img
        return results

    def __repr__(self) -> str:
        return self.__class__.__name__ + \
            f'(to_bgr={self.to_bgr}, dtype={self.dtype})'


@TRANSFORMS.register_module()
class Collect(BaseTransform):
    """Collect and only reserve the specified fields.

    **Required Keys:**

    - ``*keys``

    **Deleted Keys:**

    All keys except those in the argument ``*keys``.

    Args:
        keys (Sequence[str]): The keys of the fields to be collected.
    """

    def __init__(self, keys):
        self.keys = keys

    def transform(self, results):
        data = {}
        for key in self.keys:
            data[key] = results[key]
        return data

    def __repr__(self):
        return self.__class__.__name__ + f'(keys={self.keys})'