# Copyright (c) OpenMMLab. All rights reserved.
import io
import random
import warnings
from collections.abc import Sequence

import mmcv
import numpy as np
import torch
from mmcv.fileio import FileClient

from easycv.datasets.registry import PIPELINES
from easycv.datasets.shared.pipelines.format import to_tensor


@PIPELINES.register_module()
class DecordInit:
    """Using decord to initialize the video_reader.

    Decord: https://github.com/dmlc/decord

    Required keys are "filename",
    added or modified keys are "video_reader" and "total_frames".
    """

    def __init__(self, io_backend='disk', num_threads=1, **kwargs):
        self.io_backend = io_backend
        self.num_threads = num_threads
        self.kwargs = kwargs
        self.file_client = None

    def __call__(self, results):
        """Perform the Decord initialization.

        Args:
            results (dict): The resulting dict to be modified and passed
                to the next transform in pipeline.
        """
        try:
            import decord
        except ImportError:
            raise ImportError(
                'Please run "pip install decord" to install Decord first.')

        if self.file_client is None:
            self.file_client = FileClient(self.io_backend, **self.kwargs)
        file_obj = io.BytesIO(self.file_client.get(results['filename']))
        container = decord.VideoReader(file_obj, num_threads=self.num_threads)
        results['video_reader'] = container
        results['total_frames'] = len(container)
        return results

    def __repr__(self):
        repr_str = (f'{self.__class__.__name__}('
                    f'io_backend={self.io_backend}, '
                    f'num_threads={self.num_threads})')
        return repr_str


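# A minimal usage sketch (illustrative only; 'demo.mp4' is a hypothetical
# local file):
#
#   init = DecordInit(io_backend='disk', num_threads=1)
#   results = init(dict(filename='demo.mp4'))
#   # results['video_reader'] is a decord.VideoReader and
#   # results['total_frames'] the number of decodable frames.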
@PIPELINES.register_module()
class DecordDecode:
    """Using decord to decode the video.

    Decord: https://github.com/dmlc/decord

    Required keys are "video_reader", "filename" and "frame_inds",
    added or modified keys are "imgs" and "original_shape".
    """

    def __call__(self, results):
        """Perform the Decord decoding.

        Args:
            results (dict): The resulting dict to be modified and passed
                to the next transform in pipeline.
        """
        container = results['video_reader']

        if results['frame_inds'].ndim != 1:
            results['frame_inds'] = np.squeeze(results['frame_inds'])

        frame_inds = results['frame_inds']
        # Generate frame index mapping in order
        frame_dict = {
            idx: container[idx].asnumpy()
            for idx in np.unique(frame_inds)
        }

        imgs = [frame_dict[idx] for idx in frame_inds]

        results['video_reader'] = None
        del container

        results['imgs'] = imgs
        results['original_shape'] = imgs[0].shape[:2]
        results['img_shape'] = imgs[0].shape[:2]

        return results


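# Decoding sketch (illustrative): once a sampler has filled in 'frame_inds',
# DecordDecode decodes each unique index only once and returns a list of
# H x W x C uint8 frames:
#
#   results['frame_inds'] = np.array([0, 8, 16, 24])
#   results = DecordDecode()(results)
#   # results['imgs'] is a list of 4 frames; 'original_shape' and
#   # 'img_shape' record (H, W) of the first frame.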
@PIPELINES.register_module()
class SampleFrames:
    """Sample frames from the video.

    Required keys are "total_frames", "start_index", added or modified keys
    are "frame_inds", "frame_interval" and "num_clips".

    Args:
        clip_len (int): Frames of each sampled output clip.
        frame_interval (int): Temporal interval of adjacent sampled frames.
            Default: 1.
        num_clips (int): Number of clips to be sampled. Default: 1.
        temporal_jitter (bool): Whether to apply temporal jittering.
            Default: False.
        twice_sample (bool): Whether to use twice sample when testing.
            If set to True, it will sample frames with and without fixed
            shift, which is commonly used for testing in TSM model.
            Default: False.
        out_of_bound_opt (str): The way to deal with out of bounds frame
            indexes. Available options are 'loop', 'repeat_last'.
            Default: 'loop'.
        test_mode (bool): Store True when building test or validation dataset.
            Default: False.
        frame_uniform (bool): Whether to use the sthv2-style uniform sampling
            implemented in ``get_seq_frames``. Default: False.
    """

    def __init__(self,
                 clip_len,
                 frame_interval=1,
                 num_clips=1,
                 temporal_jitter=False,
                 twice_sample=False,
                 out_of_bound_opt='loop',
                 test_mode=False,
                 frame_uniform=False):

        self.clip_len = clip_len
        self.frame_interval = frame_interval
        self.num_clips = num_clips
        self.temporal_jitter = temporal_jitter
        self.twice_sample = twice_sample
        self.out_of_bound_opt = out_of_bound_opt
        self.test_mode = test_mode
        self.frame_uniform = frame_uniform
        assert self.out_of_bound_opt in ['loop', 'repeat_last']

    def _get_train_clips(self, num_frames):
        """Get clip offsets in train mode.

        It will calculate the average interval for selected frames,
        and randomly shift them within [0, avg_interval).
        If the total number of frames is smaller than the number of clips or
        the original clip length, it will return all zero indices.

        Args:
            num_frames (int): Total number of frames in the video.

        Returns:
            np.ndarray: Sampled frame indices in train mode.
        """
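        # Worked example (illustrative numbers): num_frames=300, clip_len=8,
        # frame_interval=2, num_clips=4 -> ori_clip_len=16 and
        # avg_interval=(300-16+1)//4=71, so offsets start at 0, 71, 142, 213
        # and each gets an extra random shift drawn from [0, 71).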
        ori_clip_len = self.clip_len * self.frame_interval
        avg_interval = (num_frames - ori_clip_len + 1) // self.num_clips

        if avg_interval > 0:
            base_offsets = np.arange(self.num_clips) * avg_interval
            clip_offsets = base_offsets + np.random.randint(
                avg_interval, size=self.num_clips)
        elif num_frames > max(self.num_clips, ori_clip_len):
            clip_offsets = np.sort(
                np.random.randint(
                    num_frames - ori_clip_len + 1, size=self.num_clips))
        elif avg_interval == 0:
            ratio = (num_frames - ori_clip_len + 1.0) / self.num_clips
            clip_offsets = np.around(np.arange(self.num_clips) * ratio)
        else:
            clip_offsets = np.zeros((self.num_clips, ), dtype=np.int64)

        return clip_offsets

    def _get_test_clips(self, num_frames):
        """Get clip offsets in test mode.

        Calculate the average interval for selected frames, and shift them
        by a fixed offset of avg_interval / 2. If twice_sample is True, it
        will additionally sample frames without the fixed shift. If the total
        number of frames is not enough, it will return all zero indices.

        Args:
            num_frames (int): Total number of frames in the video.

        Returns:
            np.ndarray: Sampled frame indices in test mode.
        """
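        # Worked example (illustrative numbers): num_frames=300, clip_len=8,
        # frame_interval=2, num_clips=4 -> ori_clip_len=16 and
        # avg_interval=(300-16+1)/4=71.25, so the fixed offsets become
        # int(0 + 35.625), int(71.25 + 35.625), ... = 35, 106, 178, 249.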
        ori_clip_len = self.clip_len * self.frame_interval
        avg_interval = (num_frames - ori_clip_len + 1) / float(self.num_clips)
        if num_frames > ori_clip_len - 1:
            base_offsets = np.arange(self.num_clips) * avg_interval
            clip_offsets = (base_offsets + avg_interval / 2.0).astype(np.int64)
            if self.twice_sample:
                clip_offsets = np.concatenate([clip_offsets, base_offsets])
        else:
            clip_offsets = np.zeros((self.num_clips, ), dtype=np.int64)
        return clip_offsets

    def _sample_clips(self, num_frames):
        """Choose clip offsets for the video in a given mode.

        Args:
            num_frames (int): Total number of frames in the video.

        Returns:
            np.ndarray: Sampled frame indices.
        """
        if self.test_mode:
            clip_offsets = self._get_test_clips(num_frames)
        else:
            clip_offsets = self._get_train_clips(num_frames)

        return clip_offsets

    def get_seq_frames(self, num_frames):
        """
        Modified from https://github.com/facebookresearch/SlowFast/blob/64abcc90ccfdcbb11cf91d6e525bed60e92a8796/slowfast/datasets/ssv2.py#L159
        Given the total number of frames, return the list of sampled frame
        indices.

        Args:
            num_frames (int): Total number of frames in the video.

        Returns:
            np.ndarray: Indices of the frames sampled from the video.
        """
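        # Illustrative numbers: num_frames=40, clip_len=8 -> seg_size=39/8,
        # i.e. 4.875; training draws a random index from each of the 8
        # segments, test mode takes each segment's midpoint.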
        seg_size = float(num_frames - 1) / self.clip_len
        seq = []
        for i in range(self.clip_len):
            start = int(np.round(seg_size * i))
            end = int(np.round(seg_size * (i + 1)))
            if not self.test_mode:
                seq.append(random.randint(start, end))
            else:
                seq.append((start + end) // 2)

        return np.array(seq)

    def __call__(self, results):
        """Perform the SampleFrames loading.

        Args:
            results (dict): The resulting dict to be modified and passed
                to the next transform in pipeline.
        """
        total_frames = results['total_frames']
        if self.frame_uniform:  # sthv2 sampling strategy
            assert results['start_index'] == 0
            frame_inds = self.get_seq_frames(total_frames)
        else:
            clip_offsets = self._sample_clips(total_frames)
            frame_inds = clip_offsets[:, None] + np.arange(
                self.clip_len)[None, :] * self.frame_interval
            frame_inds = np.concatenate(frame_inds)

            if self.temporal_jitter:
                perframe_offsets = np.random.randint(
                    self.frame_interval, size=len(frame_inds))
                frame_inds += perframe_offsets

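            # Handle indices that run past the end of short videos: 'loop'
            # wraps them modulo total_frames, while 'repeat_last' replaces
            # every out-of-range index in a clip with that clip's last
            # in-range index.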
            frame_inds = frame_inds.reshape((-1, self.clip_len))
            if self.out_of_bound_opt == 'loop':
                frame_inds = np.mod(frame_inds, total_frames)
            elif self.out_of_bound_opt == 'repeat_last':
                safe_inds = frame_inds < total_frames
                unsafe_inds = 1 - safe_inds
                last_ind = np.max(safe_inds * frame_inds, axis=1)
                new_inds = (
                    safe_inds * frame_inds + (unsafe_inds.T * last_ind).T)
                frame_inds = new_inds
            else:
                raise ValueError('Illegal out_of_bound option.')

            start_index = results['start_index']
            frame_inds = np.concatenate(frame_inds) + start_index

        results['frame_inds'] = frame_inds.astype(np.int64)
        results['clip_len'] = self.clip_len
        results['frame_interval'] = self.frame_interval
        results['num_clips'] = self.num_clips
        return results

    def __repr__(self):
        repr_str = (f'{self.__class__.__name__}('
                    f'clip_len={self.clip_len}, '
                    f'frame_interval={self.frame_interval}, '
                    f'num_clips={self.num_clips}, '
                    f'temporal_jitter={self.temporal_jitter}, '
                    f'twice_sample={self.twice_sample}, '
                    f'out_of_bound_opt={self.out_of_bound_opt}, '
                    f'test_mode={self.test_mode})')
        return repr_str


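# Sampling sketch (illustrative numbers): with clip_len=8, frame_interval=2,
# num_clips=1 on a 300-frame video, SampleFrames yields 8 indices two frames
# apart from a random offset, e.g. [37, 39, 41, 43, 45, 47, 49, 51], and
# records 'clip_len', 'frame_interval' and 'num_clips' for later formatting.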
@PIPELINES.register_module()
class VideoToTensor:
    """Convert some values in results dict to `torch.Tensor` type in data
    loader pipeline.

    Args:
        keys (Sequence[str]): Required keys to be converted.
    """

    def __init__(self, keys):
        self.keys = keys

    def __call__(self, results):
        """Performs the ToTensor formatting.

        Args:
            results (dict): The resulting dict to be modified and passed
                to the next transform in pipeline.
        """
        for key in self.keys:
            results[key] = to_tensor(results[key])
        return results

    def __repr__(self):
        return f'{self.__class__.__name__}(keys={self.keys})'


@PIPELINES.register_module()
class FormatShape:
    """Format final imgs shape to the given input_format.

    Required keys are "imgs", "num_clips" and "clip_len", added or modified
    keys are "imgs" and "input_shape".

    Args:
        input_format (str): Define the final imgs format.
        collapse (bool): To collapse input_format N... to ... (NCTHW to CTHW,
            etc.) if N is 1. Should be set as True when training and testing
            detectors. Default: False.
    """

    def __init__(self, input_format, collapse=False):
        self.input_format = input_format
        self.collapse = collapse
        if self.input_format not in ['NCTHW', 'NCHW', 'NCHW_Flow', 'NPTCHW']:
            raise ValueError(
                f'The input format {self.input_format} is invalid.')

    def __call__(self, results):
        """Performs the FormatShape formatting.

        Args:
            results (dict): The resulting dict to be modified and passed
                to the next transform in pipeline.
        """
        if not isinstance(results['imgs'], np.ndarray):
            results['imgs'] = np.array(results['imgs'])
        imgs = results['imgs']
        # [M x H x W x C]
        # M = 1 * N_crops * N_clips * L
        if self.collapse:
            assert results['num_clips'] == 1

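        # Shape example (illustrative): 8 frames of 224x224 RGB with
        # num_clips=1 arrive as imgs of shape (8, 224, 224, 3); 'NCTHW'
        # reshapes them to (1, 1, 8, 224, 224, 3), moves channels forward to
        # (1, 1, 3, 8, 224, 224) and finally flattens crops/clips to
        # (1, 3, 8, 224, 224).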
        if self.input_format == 'NCTHW':
            num_clips = results['num_clips']
            clip_len = results['clip_len']

            imgs = imgs.reshape((-1, num_clips, clip_len) + imgs.shape[1:])
            # N_crops x N_clips x L x H x W x C
            imgs = np.transpose(imgs, (0, 1, 5, 2, 3, 4))
            # N_crops x N_clips x C x L x H x W
            imgs = imgs.reshape((-1, ) + imgs.shape[2:])
            # M' x C x L x H x W
            # M' = N_crops x N_clips
        elif self.input_format == 'NCHW':
            imgs = np.transpose(imgs, (0, 3, 1, 2))
            # M x C x H x W
        elif self.input_format == 'NCHW_Flow':
            num_clips = results['num_clips']
            clip_len = results['clip_len']
            imgs = imgs.reshape((-1, num_clips, clip_len) + imgs.shape[1:])
            # N_crops x N_clips x L x H x W x C
            imgs = np.transpose(imgs, (0, 1, 2, 5, 3, 4))
            # N_crops x N_clips x L x C x H x W
            imgs = imgs.reshape((-1, imgs.shape[2] * imgs.shape[3]) +
                                imgs.shape[4:])
            # M' x C' x H x W
            # M' = N_crops x N_clips
            # C' = L x C
        elif self.input_format == 'NPTCHW':
            num_proposals = results['num_proposals']
            num_clips = results['num_clips']
            clip_len = results['clip_len']
            imgs = imgs.reshape((num_proposals, num_clips * clip_len) +
                                imgs.shape[1:])
            # P x M x H x W x C
            # M = N_clips x L
            imgs = np.transpose(imgs, (0, 1, 4, 2, 3))
            # P x M x C x H x W
        if self.collapse:
            assert imgs.shape[0] == 1
            imgs = imgs.squeeze(0)

        results['imgs'] = imgs
        results['input_shape'] = imgs.shape
        return results

    def __repr__(self):
        repr_str = self.__class__.__name__
        repr_str += f"(input_format='{self.input_format}')"
        return repr_str
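

# A typical training pipeline sketch using the transforms above
# (illustrative config; exact values depend on the model):
#
#   train_pipeline = [
#       dict(type='DecordInit'),
#       dict(type='SampleFrames', clip_len=8, frame_interval=2, num_clips=1),
#       dict(type='DecordDecode'),
#       dict(type='FormatShape', input_format='NCTHW'),
#       dict(type='VideoToTensor', keys=['imgs']),
#   ]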