DINOv/datasets/dataset_mappers/ytvos_dataset_mapper.py

# Copyright (c) Facebook, Inc. and its affiliates.
# Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/d2/detr/dataset_mapper.py
import os

import cv2
import numpy as np
import torch
from PIL import Image
from torchvision import transforms

from detectron2.config import configurable
from detectron2.structures import BitMasks, Instances
__all__ = ["YTVOSDatasetMapper"]


class VideoReader(object):
    """
    Reads a single video clip, one frame at a time.
    """

    def __init__(self, image_dir, mask_dir, objects, min_size=None, max_size=None):
        """
        Args:
            image_dir: directory of jpg frames for one clip.
            mask_dir: directory of png annotation masks for the same clip.
            objects: dict mapping object-id strings to per-object metadata;
                each value holds a 'frames' list whose first and last entries
                name the frames (without extension) in which that object
                first and last appears.
            min_size: resize the shorter image side to this value.
            max_size: cap on the longer image side after resizing.
        """
        self.image_dir = image_dir
        self.mask_dir = mask_dir
        self.use_all_mask = True
        self.vid_name = os.path.basename(image_dir)
        self.frames = sorted(os.listdir(self.image_dir))
        self.first_gt_path = os.path.join(self.mask_dir, sorted(os.listdir(self.mask_dir))[0])
        self.palette = Image.open(self.first_gt_path).getpalette()
        self.object_ids = [int(x) for x in objects.keys()]
        # First/last annotated frame of every object, used to flag key frames below.
        self.object_id_to_start_frame = {
            key: os.path.join(mask_dir, "{}.png".format(objects[key]['frames'][0]))
            for key in objects.keys()
        }
        self.object_id_to_end_frame = {
            key: os.path.join(mask_dir, "{}.png".format(objects[key]['frames'][-1]))
            for key in objects.keys()
        }
        self.start_frames = set(self.object_id_to_start_frame.values())
        self.end_frames = set(self.object_id_to_end_frame.values())
        self.mappers = {idx: int(x) for idx, x in enumerate(self.object_ids)}
        self.transform = transforms.Compose(
            [transforms.Resize(min_size, interpolation=Image.BICUBIC, max_size=max_size)]
        )

    def __getitem__(self, idx):
        dataset_dict = {}
        frame = self.frames[idx]
        im_path = os.path.join(self.image_dir, frame)
        image = Image.open(im_path).convert('RGB')
        # Record the original (pre-resize) frame size.
        dataset_dict['width'] = image.size[0]
        dataset_dict['height'] = image.size[1]
        image = self.transform(image)
        # PIL (H, W, 3) -> tensor (3, H, W).
        image = torch.from_numpy(np.asarray(image).copy())
        image = image.permute(2, 0, 1)
        gt_path = os.path.join(self.mask_dir, '{}.png'.format(os.path.splitext(frame)[0]))
        key_frames = torch.zeros(len(self.object_ids)).bool()
        end_frames = torch.zeros(len(self.object_ids)).bool()
        if os.path.exists(gt_path):
            mask = Image.open(gt_path).convert('P')
            mask = np.array(mask, dtype=np.uint8)
            # One binary mask per annotated object id.
            object_masks = [mask == obj_id for obj_id in self.object_ids]
            instances = Instances(image.shape[-2:])
            _, h, w = image.shape
            # Resize each mask to the transformed image size. Cubic interpolation
            # can produce non-binary values at mask borders; BitMasks binarizes
            # them again (non-zero -> True).
            masks = [
                cv2.resize(object_mask.astype(np.uint8), (w, h), interpolation=cv2.INTER_CUBIC)
                for object_mask in object_masks
            ]
            masks = BitMasks(
                torch.stack([torch.from_numpy(np.ascontiguousarray(x)) for x in masks])
            )
            instances.gt_masks = masks
            instances.gt_boxes = masks.get_bounding_boxes()
            dataset_dict['instances'] = instances
            # Also keep the masks at the original annotation resolution.
            dataset_dict['gt_masks_orisize'] = torch.stack(
                [torch.from_numpy(object_mask) for object_mask in object_masks]
            )
            # Mark this frame if it is the first/last annotated frame of any object.
            if gt_path in self.start_frames:
                for index, obj_id in enumerate(self.object_ids):
                    if gt_path == self.object_id_to_start_frame[str(obj_id)]:
                        key_frames[index] = True
            if gt_path in self.end_frames:
                for index, obj_id in enumerate(self.object_ids):
                    if gt_path == self.object_id_to_end_frame[str(obj_id)]:
                        end_frames[index] = True
        dataset_dict['image'] = image
        dataset_dict['key_frame'] = key_frames
        dataset_dict['frame_id'] = os.path.splitext(frame)[0]
        dataset_dict['end_frame'] = end_frames
        return dataset_dict

    def __len__(self):
        return len(self.frames)
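

# Usage sketch (illustrative only: the directory names and the `objects`
# mapping below are hypothetical, not taken from this repository):
#
#   objects = {'1': {'frames': ['00000', '00055']}}
#   reader = VideoReader('JPEGImages/clip_0000', 'Annotations/clip_0000',
#                        objects, min_size=480, max_size=800)
#   for sample in reader:
#       image = sample['image']                   # (3, H, W) uint8 tensor
#       if 'instances' in sample:                 # only on annotated frames
#           masks = sample['instances'].gt_masks  # BitMasks, one per object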


# This mapper is specifically designed for the YouTubeVOS dataset.
class YTVOSDatasetMapper:
    """
    A callable which takes a dataset dict in Detectron2 Dataset format
    and maps it into a format used by MaskFormer.

    Unlike the per-image COCO mappers, this mapper works on whole clips:
    calling it returns a VideoReader which, frame by frame,

    1. reads the image from the clip directory given by "file_name",
    2. resizes image and annotation to the test resolution,
    3. converts image and masks to Tensors, and
    4. flags the first and last annotated frame of every object.
    """
@configurable
def __init__(
self,
is_train=True,
dataset_name='',
min_size_test=None,
max_size_test=None,
):
"""
NOTE: this interface is experimental.
Args:
is_train: for training or inference
augmentations: a list of augmentations or deterministic transforms to apply
tfm_gens: data augmentation
image_format: an image format supported by :func:`detection_utils.read_image`.
"""
self.is_train = is_train
self.dataset_name = dataset_name
self.min_size_test = min_size_test
self.max_size_test = max_size_test
@classmethod
def from_config(cls, cfg, is_train=True, dataset_name=''):
ret = {
"is_train": is_train,
"dataset_name": dataset_name,
"min_size_test": cfg['INPUT']['MIN_SIZE_TEST'],
"max_size_test": cfg['INPUT']['MAX_SIZE_TEST'],
}
return ret
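
    # Expected config shape (a sketch; only the two keys read above matter,
    # and the values here are illustrative):
    #   cfg = {'INPUT': {'MIN_SIZE_TEST': 480, 'MAX_SIZE_TEST': 800}}
    #   mapper = YTVOSDatasetMapper(**YTVOSDatasetMapper.from_config(cfg, is_train=False))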
    def __call__(self, dataset_dict):
        """
        Args:
            dataset_dict (dict): Metadata of one video clip, in Detectron2
                Dataset format; must provide 'file_name' (the clip's frame
                directory), 'mask_name' (its mask directory) and 'objects'.
        Returns:
            VideoReader: an iterable over the clip's frames, each yielding a
                dict that builtin models in detectron2 accept.
        """
        return VideoReader(
            dataset_dict['file_name'],
            dataset_dict['mask_name'],
            dataset_dict['objects'],
            min_size=self.min_size_test,
            max_size=self.max_size_test,
        )
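

# Minimal usage sketch (assumption: all values below are illustrative and not
# taken from this repository's configs or registration code). Constructing the
# mapper touches no data on disk; calling it requires real YouTubeVOS-style
# image/mask directories, so that part is left commented out.
if __name__ == "__main__":
    mapper = YTVOSDatasetMapper(
        is_train=False,
        dataset_name="ytvos",
        min_size_test=480,  # hypothetical test-time resize values
        max_size_test=800,
    )
    # reader = mapper({
    #     "file_name": "JPEGImages/clip_0000",   # jpg frames of one clip
    #     "mask_name": "Annotations/clip_0000",  # png masks of the same clip
    #     "objects": {"1": {"frames": ["00000", "00055"]}},
    # })
    # for sample in reader:
    #     ...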