# Copyright (c) OpenMMLab. All rights reserved.
import io
import random
import warnings
from collections.abc import Sequence
import mmcv
import numpy as np
import torch
from mmcv.fileio import FileClient
from easycv.datasets.registry import PIPELINES
from easycv.datasets.shared.pipelines.format import to_tensor
@PIPELINES.register_module()
class DecordInit:
"""Using decord to initialize the video_reader.
Decord: https://github.com/dmlc/decord
Required keys are "filename",
added or modified keys are "video_reader" and "total_frames".
"""
def __init__(self, io_backend='disk', num_threads=1, **kwargs):
self.io_backend = io_backend
self.num_threads = num_threads
self.kwargs = kwargs
self.file_client = None
def __call__(self, results):
"""Perform the Decord initialization.
Args:
results (dict): The resulting dict to be modified and passed
                to the next transform in the pipeline.
"""
try:
import decord
except ImportError:
raise ImportError(
'Please run "pip install decord" to install Decord first.')
if self.file_client is None:
self.file_client = FileClient(self.io_backend, **self.kwargs)
file_obj = io.BytesIO(self.file_client.get(results['filename']))
container = decord.VideoReader(file_obj, num_threads=self.num_threads)
results['video_reader'] = container
results['total_frames'] = len(container)
return results
def __repr__(self):
repr_str = (f'{self.__class__.__name__}('
f'io_backend={self.io_backend}, '
f'num_threads={self.num_threads})')
        return repr_str
@PIPELINES.register_module()
class DecordDecode:
"""Using decord to decode the video.
Decord: https://github.com/dmlc/decord
Required keys are "video_reader", "filename" and "frame_inds",
    added or modified keys are "imgs", "img_shape" and "original_shape".
"""
def __call__(self, results):
"""Perform the Decord decoding.
Args:
results (dict): The resulting dict to be modified and passed
                to the next transform in the pipeline.
"""
container = results['video_reader']
if results['frame_inds'].ndim != 1:
results['frame_inds'] = np.squeeze(results['frame_inds'])
frame_inds = results['frame_inds']
# Generate frame index mapping in order
frame_dict = {
idx: container[idx].asnumpy()
for idx in np.unique(frame_inds)
}
imgs = [frame_dict[idx] for idx in frame_inds]
results['video_reader'] = None
del container
results['imgs'] = imgs
results['original_shape'] = imgs[0].shape[:2]
results['img_shape'] = imgs[0].shape[:2]
return results
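

# A minimal usage sketch of chaining the two decord transforms above. The
# video path is a hypothetical placeholder, not a file shipped with EasyCV;
# any readable video file works.
def _demo_decord_pipeline(filename='demo.mp4'):
    results = {'filename': filename, 'start_index': 0}
    results = DecordInit()(results)  # adds 'video_reader' and 'total_frames'
    # Decode every 10th frame; any 1-D array of valid indices is accepted.
    results['frame_inds'] = np.arange(0, results['total_frames'], 10)
    results = DecordDecode()(results)  # adds 'imgs' and 'original_shape'
    return results
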
@PIPELINES.register_module()
class SampleFrames:
"""Sample frames from the video.
    Required keys are "total_frames" and "start_index", added or modified keys
    are "frame_inds", "frame_interval" and "num_clips".
Args:
clip_len (int): Frames of each sampled output clip.
frame_interval (int): Temporal interval of adjacent sampled frames.
Default: 1.
num_clips (int): Number of clips to be sampled. Default: 1.
temporal_jitter (bool): Whether to apply temporal jittering.
Default: False.
twice_sample (bool): Whether to use twice sample when testing.
If set to True, it will sample frames with and without fixed shift,
which is commonly used for testing in TSM model. Default: False.
out_of_bound_opt (str): The way to deal with out of bounds frame
indexes. Available options are 'loop', 'repeat_last'.
Default: 'loop'.
        test_mode (bool): Store True when building test or validation dataset.
            Default: False.
        frame_uniform (bool): Whether to sample frames uniformly over the
            whole video (the SthV2-style strategy implemented in
            ``get_seq_frames``) instead of clip-based sampling. Default: False.
"""
def __init__(self,
clip_len,
frame_interval=1,
num_clips=1,
temporal_jitter=False,
twice_sample=False,
out_of_bound_opt='loop',
test_mode=False,
frame_uniform=False):
self.clip_len = clip_len
self.frame_interval = frame_interval
self.num_clips = num_clips
self.temporal_jitter = temporal_jitter
self.twice_sample = twice_sample
self.out_of_bound_opt = out_of_bound_opt
self.test_mode = test_mode
self.frame_uniform = frame_uniform
assert self.out_of_bound_opt in ['loop', 'repeat_last']
def _get_train_clips(self, num_frames):
"""Get clip offsets in train mode.
It will calculate the average interval for selected frames,
and randomly shift them within offsets between [0, avg_interval].
If the total number of frames is smaller than clips num or origin
frames length, it will return all zero indices.
Args:
            num_frames (int): Total number of frames in the video.
Returns:
np.ndarray: Sampled frame indices in train mode.
"""
ori_clip_len = self.clip_len * self.frame_interval
avg_interval = (num_frames - ori_clip_len + 1) // self.num_clips
if avg_interval > 0:
base_offsets = np.arange(self.num_clips) * avg_interval
clip_offsets = base_offsets + np.random.randint(
avg_interval, size=self.num_clips)
elif num_frames > max(self.num_clips, ori_clip_len):
clip_offsets = np.sort(
np.random.randint(
num_frames - ori_clip_len + 1, size=self.num_clips))
elif avg_interval == 0:
ratio = (num_frames - ori_clip_len + 1.0) / self.num_clips
clip_offsets = np.around(np.arange(self.num_clips) * ratio)
else:
clip_offsets = np.zeros((self.num_clips, ), dtype=np.int64)
return clip_offsets
def _get_test_clips(self, num_frames):
"""Get clip offsets in test mode.
Calculate the average interval for selected frames, and shift them
fixedly by avg_interval/2. If set twice_sample True, it will sample
frames together without fixed shift. If the total number of frames is
not enough, it will return all zero indices.
Args:
            num_frames (int): Total number of frames in the video.
Returns:
np.ndarray: Sampled frame indices in test mode.
"""
ori_clip_len = self.clip_len * self.frame_interval
avg_interval = (num_frames - ori_clip_len + 1) / float(self.num_clips)
if num_frames > ori_clip_len - 1:
base_offsets = np.arange(self.num_clips) * avg_interval
clip_offsets = (base_offsets + avg_interval / 2.0).astype(np.int64)
if self.twice_sample:
clip_offsets = np.concatenate([clip_offsets, base_offsets])
else:
clip_offsets = np.zeros((self.num_clips, ), dtype=np.int64)
return clip_offsets
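    # For example, in test mode with num_frames=100, clip_len=8,
    # frame_interval=2 and num_clips=3: ori_clip_len is 16, avg_interval is
    # (100 - 16 + 1) / 3 = 28.33, and the fixed offsets come out as
    # ([0, 28.33, 56.67] + 14.17).astype(np.int64) = [14, 42, 70].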
def _sample_clips(self, num_frames):
"""Choose clip offsets for the video in a given mode.
Args:
            num_frames (int): Total number of frames in the video.
Returns:
np.ndarray: Sampled frame indices.
"""
if self.test_mode:
clip_offsets = self._get_test_clips(num_frames)
else:
clip_offsets = self._get_train_clips(num_frames)
return clip_offsets
def get_seq_frames(self, num_frames):
"""
Modified from https://github.com/facebookresearch/SlowFast/blob/64abcc90ccfdcbb11cf91d6e525bed60e92a8796/slowfast/datasets/ssv2.py#L159
Given the video index, return the list of sampled frame indexes.
Args:
num_frames (int): Total number of frame in the video.
Returns:
seq (list): the indexes of frames of sampled from the video.
"""
seg_size = float(num_frames - 1) / self.clip_len
seq = []
for i in range(self.clip_len):
start = int(np.round(seg_size * i))
end = int(np.round(seg_size * (i + 1)))
if not self.test_mode:
seq.append(random.randint(start, end))
else:
seq.append((start + end) // 2)
return np.array(seq)
def __call__(self, results):
"""Perform the SampleFrames loading.
Args:
results (dict): The resulting dict to be modified and passed
                to the next transform in the pipeline.
"""
total_frames = results['total_frames']
if self.frame_uniform: # sthv2 sampling strategy
assert results['start_index'] == 0
frame_inds = self.get_seq_frames(total_frames)
else:
clip_offsets = self._sample_clips(total_frames)
frame_inds = clip_offsets[:, None] + np.arange(
self.clip_len)[None, :] * self.frame_interval
frame_inds = np.concatenate(frame_inds)
if self.temporal_jitter:
perframe_offsets = np.random.randint(
self.frame_interval, size=len(frame_inds))
frame_inds += perframe_offsets
frame_inds = frame_inds.reshape((-1, self.clip_len))
if self.out_of_bound_opt == 'loop':
frame_inds = np.mod(frame_inds, total_frames)
elif self.out_of_bound_opt == 'repeat_last':
safe_inds = frame_inds < total_frames
unsafe_inds = 1 - safe_inds
last_ind = np.max(safe_inds * frame_inds, axis=1)
new_inds = (
safe_inds * frame_inds + (unsafe_inds.T * last_ind).T)
frame_inds = new_inds
else:
raise ValueError('Illegal out_of_bound option.')
start_index = results['start_index']
frame_inds = np.concatenate(frame_inds) + start_index
results['frame_inds'] = frame_inds.astype(np.int64)
results['clip_len'] = self.clip_len
results['frame_interval'] = self.frame_interval
results['num_clips'] = self.num_clips
return results
def __repr__(self):
repr_str = (f'{self.__class__.__name__}('
f'clip_len={self.clip_len}, '
f'frame_interval={self.frame_interval}, '
f'num_clips={self.num_clips}, '
f'temporal_jitter={self.temporal_jitter}, '
f'twice_sample={self.twice_sample}, '
f'out_of_bound_opt={self.out_of_bound_opt}, '
f'test_mode={self.test_mode})')
return repr_str
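

# A worked sketch of SampleFrames on a hypothetical 100-frame video. With
# clip_len=8, frame_interval=2 and num_clips=3, ori_clip_len is 8 * 2 = 16,
# so the train-mode avg_interval is (100 - 16 + 1) // 3 = 28 and the three
# clips start at offsets [0, 28, 56] plus a random shift drawn from [0, 28).
def _demo_sample_frames():
    sampler = SampleFrames(clip_len=8, frame_interval=2, num_clips=3)
    results = sampler({'total_frames': 100, 'start_index': 0})
    assert results['frame_inds'].shape == (3 * 8, )
    return results['frame_inds']
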
@PIPELINES.register_module()
class VideoToTensor:
"""Convert some values in results dict to `torch.Tensor` type in data
loader pipeline.
Args:
keys (Sequence[str]): Required keys to be converted.
"""
def __init__(self, keys):
self.keys = keys
def __call__(self, results):
"""Performs the ToTensor formating.
Args:
results (dict): The resulting dict to be modified and passed
to the next transform in pipeline.
"""
for key in self.keys:
results[key] = to_tensor(results[key])
return results
def __repr__(self):
return f'{self.__class__.__name__}(keys={self.keys})'
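

# A minimal sketch of VideoToTensor: each listed key is converted with
# to_tensor, so downstream transforms receive torch.Tensor values. The dummy
# array below stands in for decoded frames.
def _demo_video_to_tensor():
    results = {'imgs': np.zeros((8, 224, 224, 3), dtype=np.float32)}
    results = VideoToTensor(keys=['imgs'])(results)
    assert isinstance(results['imgs'], torch.Tensor)
    return results
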
@PIPELINES.register_module()
class FormatShape:
"""Format final imgs shape to the given input_format.
Required keys are "imgs", "num_clips" and "clip_len", added or modified
keys are "imgs" and "input_shape".
Args:
input_format (str): Define the final imgs format.
        collapse (bool): Whether to collapse the leading N dimension of
            input_format when N is 1 (e.g. NCTHW to CTHW). Should be set to
            True when training and testing detectors. Default: False.
"""
def __init__(self, input_format, collapse=False):
self.input_format = input_format
self.collapse = collapse
if self.input_format not in ['NCTHW', 'NCHW', 'NCHW_Flow', 'NPTCHW']:
raise ValueError(
f'The input format {self.input_format} is invalid.')
def __call__(self, results):
"""Performs the FormatShape formating.
Args:
results (dict): The resulting dict to be modified and passed
to the next transform in pipeline.
"""
if not isinstance(results['imgs'], np.ndarray):
results['imgs'] = np.array(results['imgs'])
imgs = results['imgs']
# [M x H x W x C]
# M = 1 * N_crops * N_clips * L
if self.collapse:
assert results['num_clips'] == 1
if self.input_format == 'NCTHW':
num_clips = results['num_clips']
clip_len = results['clip_len']
imgs = imgs.reshape((-1, num_clips, clip_len) + imgs.shape[1:])
# N_crops x N_clips x L x H x W x C
imgs = np.transpose(imgs, (0, 1, 5, 2, 3, 4))
# N_crops x N_clips x C x L x H x W
imgs = imgs.reshape((-1, ) + imgs.shape[2:])
# M' x C x L x H x W
# M' = N_crops x N_clips
elif self.input_format == 'NCHW':
imgs = np.transpose(imgs, (0, 3, 1, 2))
# M x C x H x W
elif self.input_format == 'NCHW_Flow':
num_clips = results['num_clips']
clip_len = results['clip_len']
imgs = imgs.reshape((-1, num_clips, clip_len) + imgs.shape[1:])
# N_crops x N_clips x L x H x W x C
imgs = np.transpose(imgs, (0, 1, 2, 5, 3, 4))
# N_crops x N_clips x L x C x H x W
imgs = imgs.reshape((-1, imgs.shape[2] * imgs.shape[3]) +
imgs.shape[4:])
# M' x C' x H x W
# M' = N_crops x N_clips
# C' = L x C
elif self.input_format == 'NPTCHW':
num_proposals = results['num_proposals']
num_clips = results['num_clips']
clip_len = results['clip_len']
imgs = imgs.reshape((num_proposals, num_clips * clip_len) +
imgs.shape[1:])
# P x M x H x W x C
# M = N_clips x L
imgs = np.transpose(imgs, (0, 1, 4, 2, 3))
# P x M x C x H x W
if self.collapse:
assert imgs.shape[0] == 1
imgs = imgs.squeeze(0)
results['imgs'] = imgs
results['input_shape'] = imgs.shape
return results
def __repr__(self):
repr_str = self.__class__.__name__
repr_str += f"(input_format='{self.input_format}')"
        return repr_str
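

# A worked sketch of FormatShape with 'NCTHW'. Sixteen dummy frames of shape
# (H, W, C), coming from num_clips=2 clips of clip_len=8, are regrouped per
# clip and the channel axis is moved ahead of time, giving
# (num_clips, C, clip_len, H, W) = (2, 3, 8, 224, 224).
def _demo_format_shape():
    results = {
        'imgs': np.zeros((16, 224, 224, 3), dtype=np.float32),
        'num_clips': 2,
        'clip_len': 8,
    }
    results = FormatShape(input_format='NCTHW')(results)
    assert results['input_shape'] == (2, 3, 8, 224, 224)
    return results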