# Copyright (c) Alibaba, Inc. and its affiliates.
import os
import unittest

import torch
from tests.ut_config import VIDEO_DATA_SMALL_RAW_LOCAL

from easycv.core.evaluation.builder import build_evaluator
from easycv.datasets.builder import build_datasource
from easycv.datasets.video_recognition.raw import VideoDataset


class VideoDatasetTest(unittest.TestCase):
    """Checks that ``VideoDataset`` yields clips of the expected shape.

    Each test builds a datasource config and a transform pipeline, fetches
    one item from the dataset and compares the shape of its ``'imgs'`` entry
    with ``torch.Size([1, 3, 32, 224, 224])``.
    """

    def setUp(self):
        """Print the name of the test that is about to run."""
        test_id = (type(self).__name__, self._testMethodName)
        print('Testing %s.%s' % test_id)

    def test_default(self):
        """Load item 10 through a ``VideoDatasource`` and check its shape."""
        root = VIDEO_DATA_SMALL_RAW_LOCAL
        annotation_path = os.path.join(root, 'kinetics400/test.txt')
        source_cfg = {
            'type': 'VideoDatasource',
            'ann_file': annotation_path,
            'data_root': root,
            'split': ' ',
        }

        norm_cfg = {
            'mean': [123.675, 116.28, 103.53],
            'std': [58.395, 57.12, 57.375],
            'to_bgr': False,
        }
        transforms = [
            {'type': 'DecordInit'},
            {
                'type': 'SampleFrames',
                'clip_len': 32,
                'frame_interval': 2,
                'num_clips': 1,
            },
            {'type': 'DecordDecode'},
            {'type': 'VideoResize', 'scale': (-1, 256)},
            {'type': 'VideoRandomResizedCrop'},
            {'type': 'VideoResize', 'scale': (224, 224), 'keep_ratio': False},
            {'type': 'VideoFlip', 'flip_ratio': 0.5},
            {'type': 'VideoNormalize', **norm_cfg},
            {'type': 'FormatShape', 'input_format': 'NCTHW'},
            {'type': 'Collect', 'keys': ['imgs', 'label'], 'meta_keys': []},
            {'type': 'VideoToTensor', 'keys': ['imgs', 'label']},
        ]

        video_dataset = VideoDataset(source_cfg, transforms)
        sample = video_dataset[10]
        expected_shape = torch.Size([1, 3, 32, 224, 224])
        self.assertEqual(sample['imgs'].shape, expected_shape)

    @unittest.skip('skipping')
    def test_video_text(self):
        """Load item 5 through a ``VideoTextDatasource`` and check its shape.

        The pipeline additionally runs ``TextTokenizer`` and collects the
        ``text_input_ids`` / ``text_input_mask`` keys. The test is currently
        skipped unconditionally.
        """
        root = VIDEO_DATA_SMALL_RAW_LOCAL
        annotation_path = os.path.join(root, 'video_text/test.txt')
        source_cfg = {
            'type': 'VideoTextDatasource',
            'ann_file': annotation_path,
            'data_root': root + '/video_text/video',
        }

        norm_cfg = {
            'mean': [123.675, 116.28, 103.53],
            'std': [58.395, 57.12, 57.375],
            'to_bgr': False,
        }
        collected_keys = [
            'imgs', 'label', 'text_input_ids', 'text_input_mask'
        ]
        transforms = [
            {'type': 'DecordInit'},
            {
                'type': 'SampleFrames',
                'clip_len': 32,
                'frame_interval': 2,
                'num_clips': 1,
            },
            {'type': 'DecordDecode'},
            {'type': 'VideoResize', 'scale': (-1, 256)},
            {'type': 'VideoRandomResizedCrop'},
            {'type': 'VideoResize', 'scale': (224, 224), 'keep_ratio': False},
            {'type': 'VideoFlip', 'flip_ratio': 0.5},
            {'type': 'VideoNormalize', **norm_cfg},
            {'type': 'TextTokenizer'},
            {'type': 'FormatShape', 'input_format': 'NCTHW'},
            {'type': 'Collect', 'keys': collected_keys, 'meta_keys': []},
            {'type': 'VideoToTensor', 'keys': ['imgs', 'label']},
        ]

        video_dataset = VideoDataset(source_cfg, transforms)
        sample = video_dataset[5]
        expected_shape = torch.Size([1, 3, 32, 224, 224])
        self.assertEqual(sample['imgs'].shape, expected_shape)


if __name__ == '__main__':
    unittest.main()