# Copyright (c) Facebook, Inc. and its affiliates.
# Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/d2/detr/dataset_mapper.py
import copy

import scipy.io
import numpy as np
import torch
from PIL import Image
from torchvision import transforms

from detectron2.config import configurable
from detectron2.data import detection_utils as utils
from detectron2.structures import BitMasks, Boxes, Instances
from detectron2.data import transforms as T

__all__ = ["PascalContextSegDatasetMapper"]


# This is specifically designed for the Pascal Context dataset.
class PascalContextSegDatasetMapper_ori:
    """
    A callable which takes a dataset dict in Detectron2 Dataset format,
    and maps it into a format used by MaskFormer.

    This dataset mapper applies the same transformation as DETR for COCO panoptic segmentation.

    The callable currently does the following:

    1. Reads the image from "file_name"
    2. Applies geometric transforms to the image and annotation
    3. Finds and applies suitable cropping to the image and annotation
    4. Prepares the image and annotation as Tensors
    """

    @configurable
    def __init__(
        self,
        is_train=True,
        min_size_test=None,
        max_size_test=None,
        mean=None,
        std=None,
    ):
        """
        NOTE: this interface is experimental.
        Args:
            is_train: for training or inference
            min_size_test: shortest-edge size used by the test-time resize
            max_size_test: upper bound on the longest edge at test time
            mean: per-channel pixel mean used for normalization
            std: per-channel pixel std used for normalization
        """
        self.is_train = is_train
        self.min_size_test = min_size_test
        self.max_size_test = max_size_test
        self.pixel_mean = torch.tensor(mean)[:, None, None]
        self.pixel_std = torch.tensor(std)[:, None, None]

        t = []
        t.append(transforms.Resize(self.min_size_test, interpolation=Image.BICUBIC))
        # HACK for ViT evaluation
        # t.append(transforms.Resize([self.min_size_test, self.min_size_test], interpolation=Image.BICUBIC))
        self.transform = transforms.Compose(t)

    @classmethod
    def from_config(cls, cfg, is_train=True):
        ret = {
            "is_train": is_train,
            "min_size_test": cfg['INPUT']['MIN_SIZE_TEST'],
            "max_size_test": cfg['INPUT']['MAX_SIZE_TEST'],
            "mean": cfg['INPUT']['PIXEL_MEAN'],
            "std": cfg['INPUT']['PIXEL_STD'],
        }
        return ret

    def read_semseg(self, file_name):
        if file_name.endswith('.png'):
            semseg = np.asarray(Image.open(file_name))
        elif file_name.endswith('.mat'):
            semseg = scipy.io.loadmat(file_name)['LabelMap']
        else:
            raise ValueError("Unsupported semseg file format: {}".format(file_name))
        return semseg

    def __call__(self, dataset_dict):
        """
        Args:
            dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.
        Returns:
            dict: a format that builtin models in detectron2 accept
        """
        dataset_dict = copy.deepcopy(dataset_dict)  # it will be modified by code below
        file_name = dataset_dict['file_name']
        semseg_name = dataset_dict['sem_seg_file_name']
        image = Image.open(file_name).convert('RGB')

        dataset_dict['width'] = image.size[0]
        dataset_dict['height'] = image.size[1]

        if not self.is_train:
            image = self.transform(image)

        image = torch.from_numpy(np.asarray(image).copy())
        image = image.permute(2, 0, 1)

        semseg = self.read_semseg(semseg_name)
        semseg = torch.from_numpy(semseg.astype(np.int32))

        dataset_dict['image'] = image
        dataset_dict['semseg'] = semseg
        return dataset_dict
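
# A minimal usage sketch for the mapper above, kept as a comment so it never
# runs at import time. The config values and file paths are hypothetical
# placeholders; only the nested keys match what `from_config` actually reads.
#
#   cfg = {'INPUT': {'MIN_SIZE_TEST': 512, 'MAX_SIZE_TEST': 2048,
#                    'PIXEL_MEAN': [123.675, 116.28, 103.53],
#                    'PIXEL_STD': [58.395, 57.12, 57.375]}}
#   mapper = PascalContextSegDatasetMapper_ori(
#       **PascalContextSegDatasetMapper_ori.from_config(cfg, is_train=False)
#   )
#   out = mapper({'file_name': 'img.jpg', 'sem_seg_file_name': 'label.mat'})  # hypothetical paths
#   # out['image']: (3, H, W) uint8 tensor; out['semseg']: (H, W) int32 tensor
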

class PascalContextSegDatasetMapper:
    """
    A callable which takes a dataset dict in Detectron2 Dataset format,
    and maps it into a format used by MaskFormer.

    This dataset mapper applies the same transformation as DETR for COCO panoptic segmentation.

    The callable currently does the following:

    1. Reads the image from "file_name"
    2. Applies geometric transforms to the image and annotation
    3. Finds and applies suitable cropping to the image and annotation
    4. Prepares the image and annotation as Tensors
    """

    @configurable
    def __init__(
        self,
        is_train=True,
        augmentations=None,
        min_size_test=None,
        max_size_test=None,
        mean=None,
        std=None,
    ):
        """
        NOTE: this interface is experimental.
        Args:
            is_train: for training or inference
            augmentations: a list of augmentations or deterministic transforms to apply
            min_size_test: shortest-edge size used by the test-time resize
            max_size_test: upper bound on the longest edge at test time
            mean: per-channel pixel mean used for normalization
            std: per-channel pixel std used for normalization
        """
        self.is_train = is_train
        self.min_size_test = min_size_test
        self.max_size_test = max_size_test
        self.pixel_mean = torch.tensor(mean)[:, None, None]
        self.pixel_std = torch.tensor(std)[:, None, None]

        t = []
        t.append(transforms.Resize(self.min_size_test, interpolation=Image.BICUBIC))
        # HACK for ViT evaluation
        # t.append(transforms.Resize([self.min_size_test, self.min_size_test], interpolation=Image.BICUBIC))
        self.transform = transforms.Compose(t)

        self.augmentations = T.AugmentationList(augmentations)
        self.ignore_label = 0

    @classmethod
    def from_config(cls, cfg, is_train=True):
        augs = utils.build_augmentation(cfg, is_train)
        ret = {
            "is_train": is_train,
            "augmentations": augs,
            "min_size_test": cfg['INPUT']['MIN_SIZE_TEST'],
            "max_size_test": cfg['INPUT']['MAX_SIZE_TEST'],
            "mean": cfg['INPUT']['PIXEL_MEAN'],
            "std": cfg['INPUT']['PIXEL_STD'],
        }
        return ret

    def read_semseg(self, file_name):
        if file_name.endswith('.png'):
            semseg = np.asarray(Image.open(file_name))
        elif file_name.endswith('.mat'):
            semseg = scipy.io.loadmat(file_name)['LabelMap']
        else:
            raise ValueError("Unsupported semseg file format: {}".format(file_name))
        return semseg

    def __call__(self, dataset_dict):
        """
        Args:
            dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.
        Returns:
            dict: a format that builtin models in detectron2 accept
        """
        dataset_dict = copy.deepcopy(dataset_dict)  # it will be modified by code below
        # USER: Write your own image loading if it's not from a file
        image = utils.read_image(dataset_dict["file_name"], format='RGB')
        utils.check_image_size(dataset_dict, image)

        # USER: Remove if you don't do semantic/panoptic segmentation.
        semseg_name = None
        if "sem_seg_file_name" in dataset_dict:
            semseg_name = dataset_dict.pop("sem_seg_file_name")
            if semseg_name.split('.')[-1] != 'mat':
                sem_seg_gt = utils.read_image(semseg_name, "L").squeeze(2)
            else:
                sem_seg_gt = self.read_semseg(semseg_name)
                sem_seg_gt = sem_seg_gt.astype(np.uint8)
        else:
            sem_seg_gt = None

        aug_input = T.AugInput(image, sem_seg=sem_seg_gt)
        # NOTE: named `aug_transforms` (not `transforms`) to avoid shadowing the
        # torchvision module imported at the top of this file.
        aug_transforms = self.augmentations(aug_input)
        image, sem_seg_gt = aug_input.image, aug_input.sem_seg

        image_shape = image.shape[:2]  # h, w

        # PyTorch's dataloader is efficient on torch.Tensor due to shared memory,
        # but not efficient on large generic data structures due to the use of
        # pickle & mp.Queue. Therefore it's important to use torch.Tensor.
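        # `image` is still an HWC uint8 numpy array at this point; the
        # conversion below moves channels first (CHW, the layout detectron2
        # models expect), and `ascontiguousarray` materializes the transposed
        # view so torch.as_tensor can wrap the buffer without an extra copy.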
dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))) if sem_seg_gt is not None: sem_seg_gt = torch.as_tensor(sem_seg_gt.astype("long")) # if self.size_divisibility > 0: # image_size = (image.shape[-2], image.shape[-1]) # padding_size = [ # 0, # self.size_divisibility - image_size[1], # 0, # self.size_divisibility - image_size[0], # ] # image = F.pad(image, padding_size, value=128).contiguous() # if sem_seg_gt is not None: # sem_seg_gt = F.pad(sem_seg_gt, padding_size, value=self.ignore_label).contiguous() if "annotations" in dataset_dict: raise ValueError("Semantic segmentation dataset should not have 'annotations'.") # Prepare per-category binary masks if sem_seg_gt is not None: # sem_seg_gt = transforms.apply_segmentation(sem_seg_gt) ### sem_seg_gt = sem_seg_gt.numpy() instances = Instances(image_shape) classes = np.unique(sem_seg_gt) # remove ignored region # TODO: this is a hack for datasets with backgorund as 0, which is meaningless classes = classes[classes != self.ignore_label] - 1 # print("semseg_name, classes ", semseg_name, classes) instances.gt_classes = torch.tensor(classes, dtype=torch.int64) masks = [] for class_id in classes: masks.append(sem_seg_gt == class_id) if len(masks) == 0: # Some image does not have annotation (all ignored) instances.gt_masks = torch.zeros((0, sem_seg_gt.shape[-2], sem_seg_gt.shape[-1])) else: masks = BitMasks( torch.stack([torch.from_numpy(np.ascontiguousarray(x.copy())) for x in masks]) ) instances.gt_masks = masks.tensor instances.gt_boxes = masks.get_bounding_boxes() dataset_dict["instances"] = instances ####### # dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below # file_name = dataset_dict['file_name'] # semseg_name = dataset_dict['sem_seg_file_name'] # image = Image.open(file_name).convert('RGB') # # dataset_dict['width'] = image.size[0] # dataset_dict['height'] = image.size[1] # # if self.is_train == False: # image = self.transform(image) # image = torch.from_numpy(np.asarray(image).copy()) # image = image.permute(2, 0, 1) # # semseg = self.read_semseg(semseg_name) # semseg = torch.from_numpy(semseg.astype(np.int32)) # # dataset_dict['image'] = image # dataset_dict['semseg'] = semseg return dataset_dict