# Copyright (c) OpenMMLab. All rights reserved. import io import random import warnings from collections.abc import Sequence import mmcv import numpy as np import torch from mmcv.fileio import FileClient from easycv.datasets.registry import PIPELINES from easycv.datasets.shared.pipelines.format import to_tensor @PIPELINES.register_module() class DecordInit: """Using decord to initialize the video_reader. Decord: https://github.com/dmlc/decord Required keys are "filename", added or modified keys are "video_reader" and "total_frames". """ def __init__(self, io_backend='disk', num_threads=1, **kwargs): self.io_backend = io_backend self.num_threads = num_threads self.kwargs = kwargs self.file_client = None def __call__(self, results): """Perform the Decord initialization. Args: results (dict): The resulting dict to be modified and passed to the next transform in pipeline. """ try: import decord except ImportError: raise ImportError( 'Please run "pip install decord" to install Decord first.') if self.file_client is None: self.file_client = FileClient(self.io_backend, **self.kwargs) file_obj = io.BytesIO(self.file_client.get(results['filename'])) container = decord.VideoReader(file_obj, num_threads=self.num_threads) results['video_reader'] = container results['total_frames'] = len(container) return results def __repr__(self): repr_str = (f'{self.__class__.__name__}(' f'io_backend={self.io_backend}, ' f'num_threads={self.num_threads})') return @PIPELINES.register_module() class DecordDecode: """Using decord to decode the video. Decord: https://github.com/dmlc/decord Required keys are "video_reader", "filename" and "frame_inds", added or modified keys are "imgs" and "original_shape". """ def __call__(self, results): """Perform the Decord decoding. Args: results (dict): The resulting dict to be modified and passed to the next transform in pipeline. """ container = results['video_reader'] if results['frame_inds'].ndim != 1: results['frame_inds'] = np.squeeze(results['frame_inds']) frame_inds = results['frame_inds'] # Generate frame index mapping in order frame_dict = { idx: container[idx].asnumpy() for idx in np.unique(frame_inds) } imgs = [frame_dict[idx] for idx in frame_inds] results['video_reader'] = None del container results['imgs'] = imgs results['original_shape'] = imgs[0].shape[:2] results['img_shape'] = imgs[0].shape[:2] return results @PIPELINES.register_module() class SampleFrames: """Sample frames from the video. Required keys are "total_frames", "start_index" , added or modified keys are "frame_inds", "frame_interval" and "num_clips". Args: clip_len (int): Frames of each sampled output clip. frame_interval (int): Temporal interval of adjacent sampled frames. Default: 1. num_clips (int): Number of clips to be sampled. Default: 1. temporal_jitter (bool): Whether to apply temporal jittering. Default: False. twice_sample (bool): Whether to use twice sample when testing. If set to True, it will sample frames with and without fixed shift, which is commonly used for testing in TSM model. Default: False. out_of_bound_opt (str): The way to deal with out of bounds frame indexes. Available options are 'loop', 'repeat_last'. Default: 'loop'. test_mode (bool): Store True when building test or validation dataset. Default: False. """ def __init__(self, clip_len, frame_interval=1, num_clips=1, temporal_jitter=False, twice_sample=False, out_of_bound_opt='loop', test_mode=False, frame_uniform=False): self.clip_len = clip_len self.frame_interval = frame_interval self.num_clips = num_clips self.temporal_jitter = temporal_jitter self.twice_sample = twice_sample self.out_of_bound_opt = out_of_bound_opt self.test_mode = test_mode self.frame_uniform = frame_uniform assert self.out_of_bound_opt in ['loop', 'repeat_last'] def _get_train_clips(self, num_frames): """Get clip offsets in train mode. It will calculate the average interval for selected frames, and randomly shift them within offsets between [0, avg_interval]. If the total number of frames is smaller than clips num or origin frames length, it will return all zero indices. Args: num_frames (int): Total number of frame in the video. Returns: np.ndarray: Sampled frame indices in train mode. """ ori_clip_len = self.clip_len * self.frame_interval avg_interval = (num_frames - ori_clip_len + 1) // self.num_clips if avg_interval > 0: base_offsets = np.arange(self.num_clips) * avg_interval clip_offsets = base_offsets + np.random.randint( avg_interval, size=self.num_clips) elif num_frames > max(self.num_clips, ori_clip_len): clip_offsets = np.sort( np.random.randint( num_frames - ori_clip_len + 1, size=self.num_clips)) elif avg_interval == 0: ratio = (num_frames - ori_clip_len + 1.0) / self.num_clips clip_offsets = np.around(np.arange(self.num_clips) * ratio) else: clip_offsets = np.zeros((self.num_clips, ), dtype=np.int64) return clip_offsets def _get_test_clips(self, num_frames): """Get clip offsets in test mode. Calculate the average interval for selected frames, and shift them fixedly by avg_interval/2. If set twice_sample True, it will sample frames together without fixed shift. If the total number of frames is not enough, it will return all zero indices. Args: num_frames (int): Total number of frame in the video. Returns: np.ndarray: Sampled frame indices in test mode. """ ori_clip_len = self.clip_len * self.frame_interval avg_interval = (num_frames - ori_clip_len + 1) / float(self.num_clips) if num_frames > ori_clip_len - 1: base_offsets = np.arange(self.num_clips) * avg_interval clip_offsets = (base_offsets + avg_interval / 2.0).astype(np.int64) if self.twice_sample: clip_offsets = np.concatenate([clip_offsets, base_offsets]) else: clip_offsets = np.zeros((self.num_clips, ), dtype=np.int64) return clip_offsets def _sample_clips(self, num_frames): """Choose clip offsets for the video in a given mode. Args: num_frames (int): Total number of frame in the video. Returns: np.ndarray: Sampled frame indices. """ if self.test_mode: clip_offsets = self._get_test_clips(num_frames) else: clip_offsets = self._get_train_clips(num_frames) return clip_offsets def get_seq_frames(self, num_frames): """ Modified from https://github.com/facebookresearch/SlowFast/blob/64abcc90ccfdcbb11cf91d6e525bed60e92a8796/slowfast/datasets/ssv2.py#L159 Given the video index, return the list of sampled frame indexes. Args: num_frames (int): Total number of frame in the video. Returns: seq (list): the indexes of frames of sampled from the video. """ seg_size = float(num_frames - 1) / self.clip_len seq = [] for i in range(self.clip_len): start = int(np.round(seg_size * i)) end = int(np.round(seg_size * (i + 1))) if not self.test_mode: seq.append(random.randint(start, end)) else: seq.append((start + end) // 2) return np.array(seq) def __call__(self, results): """Perform the SampleFrames loading. Args: results (dict): The resulting dict to be modified and passed to the next transform in pipeline. """ total_frames = results['total_frames'] if self.frame_uniform: # sthv2 sampling strategy assert results['start_index'] == 0 frame_inds = self.get_seq_frames(total_frames) else: clip_offsets = self._sample_clips(total_frames) frame_inds = clip_offsets[:, None] + np.arange( self.clip_len)[None, :] * self.frame_interval frame_inds = np.concatenate(frame_inds) if self.temporal_jitter: perframe_offsets = np.random.randint( self.frame_interval, size=len(frame_inds)) frame_inds += perframe_offsets frame_inds = frame_inds.reshape((-1, self.clip_len)) if self.out_of_bound_opt == 'loop': frame_inds = np.mod(frame_inds, total_frames) elif self.out_of_bound_opt == 'repeat_last': safe_inds = frame_inds < total_frames unsafe_inds = 1 - safe_inds last_ind = np.max(safe_inds * frame_inds, axis=1) new_inds = ( safe_inds * frame_inds + (unsafe_inds.T * last_ind).T) frame_inds = new_inds else: raise ValueError('Illegal out_of_bound option.') start_index = results['start_index'] frame_inds = np.concatenate(frame_inds) + start_index results['frame_inds'] = frame_inds.astype(np.int64) results['clip_len'] = self.clip_len results['frame_interval'] = self.frame_interval results['num_clips'] = self.num_clips return results def __repr__(self): repr_str = (f'{self.__class__.__name__}(' f'clip_len={self.clip_len}, ' f'frame_interval={self.frame_interval}, ' f'num_clips={self.num_clips}, ' f'temporal_jitter={self.temporal_jitter}, ' f'twice_sample={self.twice_sample}, ' f'out_of_bound_opt={self.out_of_bound_opt}, ' f'test_mode={self.test_mode})') return repr_str @PIPELINES.register_module() class VideoToTensor: """Convert some values in results dict to `torch.Tensor` type in data loader pipeline. Args: keys (Sequence[str]): Required keys to be converted. """ def __init__(self, keys): self.keys = keys def __call__(self, results): """Performs the ToTensor formating. Args: results (dict): The resulting dict to be modified and passed to the next transform in pipeline. """ for key in self.keys: results[key] = to_tensor(results[key]) return results def __repr__(self): return f'{self.__class__.__name__}(keys={self.keys})' @PIPELINES.register_module() class FormatShape: """Format final imgs shape to the given input_format. Required keys are "imgs", "num_clips" and "clip_len", added or modified keys are "imgs" and "input_shape". Args: input_format (str): Define the final imgs format. collapse (bool): To collpase input_format N... to ... (NCTHW to CTHW, etc.) if N is 1. Should be set as True when training and testing detectors. Default: False. """ def __init__(self, input_format, collapse=False): self.input_format = input_format self.collapse = collapse if self.input_format not in ['NCTHW', 'NCHW', 'NCHW_Flow', 'NPTCHW']: raise ValueError( f'The input format {self.input_format} is invalid.') def __call__(self, results): """Performs the FormatShape formating. Args: results (dict): The resulting dict to be modified and passed to the next transform in pipeline. """ if not isinstance(results['imgs'], np.ndarray): results['imgs'] = np.array(results['imgs']) imgs = results['imgs'] # [M x H x W x C] # M = 1 * N_crops * N_clips * L if self.collapse: assert results['num_clips'] == 1 if self.input_format == 'NCTHW': num_clips = results['num_clips'] clip_len = results['clip_len'] imgs = imgs.reshape((-1, num_clips, clip_len) + imgs.shape[1:]) # N_crops x N_clips x L x H x W x C imgs = np.transpose(imgs, (0, 1, 5, 2, 3, 4)) # N_crops x N_clips x C x L x H x W imgs = imgs.reshape((-1, ) + imgs.shape[2:]) # M' x C x L x H x W # M' = N_crops x N_clips elif self.input_format == 'NCHW': imgs = np.transpose(imgs, (0, 3, 1, 2)) # M x C x H x W elif self.input_format == 'NCHW_Flow': num_clips = results['num_clips'] clip_len = results['clip_len'] imgs = imgs.reshape((-1, num_clips, clip_len) + imgs.shape[1:]) # N_crops x N_clips x L x H x W x C imgs = np.transpose(imgs, (0, 1, 2, 5, 3, 4)) # N_crops x N_clips x L x C x H x W imgs = imgs.reshape((-1, imgs.shape[2] * imgs.shape[3]) + imgs.shape[4:]) # M' x C' x H x W # M' = N_crops x N_clips # C' = L x C elif self.input_format == 'NPTCHW': num_proposals = results['num_proposals'] num_clips = results['num_clips'] clip_len = results['clip_len'] imgs = imgs.reshape((num_proposals, num_clips * clip_len) + imgs.shape[1:]) # P x M x H x W x C # M = N_clips x L imgs = np.transpose(imgs, (0, 1, 4, 2, 3)) # P x M x C x H x W if self.collapse: assert imgs.shape[0] == 1 imgs = imgs.squeeze(0) results['imgs'] = imgs results['input_shape'] = imgs.shape return results def __repr__(self): repr_str = self.__class__.__name__ repr_str += f"(input_format='{self.input_format}')" return