OWOD/detectron2/data/datasets/voc_style_coco.py

123 lines
4.5 KiB
Python

# -*- coding: utf-8 -*-
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import itertools
import numpy as np
import os
import xml.etree.ElementTree as ET
from typing import List, Tuple, Union
from fvcore.common.file_io import PathManager
from detectron2.data import DatasetCatalog, MetadataCatalog
from detectron2.structures import BoxMode
__all__ = ["load_voc_coco_instances", "register_voc_style_coco"]
VOC_CLASS_NAMES = [
"aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat",
"chair", "cow", "diningtable", "dog", "horse", "motorbike", "person",
"pottedplant", "sheep", "sofa", "train", "tvmonitor"
]
T2_CLASS_NAMES = [
"truck", "traffic light", "fire hydrant", "stop sign", "parking meter",
"bench", "elephant", "bear", "zebra", "giraffe",
"backpack", "umbrella", "handbag", "tie", "suitcase",
"microwave", "oven", "toaster", "sink", "refrigerator"
]
T3_CLASS_NAMES = [
"frisbee", "skis", "snowboard", "sports ball", "kite",
"baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket",
"banana", "apple", "sandwich", "orange", "broccoli",
"carrot", "hot dog", "pizza", "donut", "cake"
]
T4_CLASS_NAMES = [
"bed", "toilet", "laptop", "mouse",
"remote", "keyboard", "cell phone", "book", "clock",
"vase", "scissors", "teddy bear", "hair drier", "toothbrush",
"wine glass", "cup", "fork", "knife", "spoon", "bowl"
]
UNK_CLASS = ["unknown"]
INCR_CLASS_NAMES = itertools.chain(VOC_CLASS_NAMES, T2_CLASS_NAMES, T3_CLASS_NAMES, T4_CLASS_NAMES, UNK_CLASS)
INCR_CLASS_NAMES = tuple(INCR_CLASS_NAMES)
def load_voc_coco_instances(dirname: str, split: str, class_names: Union[List[str], Tuple[str, ...]]):
"""
Load Pascal VOC detection annotations to Detectron2 format.
Args:
dirname: Contain "Annotations", "ImageSets", "JPEGImages"
split (str): one of "train", "test", "val", "trainval": t1_train, t1_test
class_names: list or tuple of class names
"""
with PathManager.open(os.path.join(dirname, "ImageSets", "Main", split + ".txt")) as f:
fileids = np.loadtxt(f, dtype=np.str)
known_class_list = None
if 't2' in split:
known_class_list = T2_CLASS_NAMES
elif 't3' in split:
known_class_list = T3_CLASS_NAMES
elif 't4' in split:
known_class_list = T4_CLASS_NAMES
# Needs to read many small annotation files. Makes sense at local
annotation_dirname = PathManager.get_local_path(os.path.join(dirname, "Annotations/"))
dicts = []
for fileid in fileids:
anno_file = os.path.join(annotation_dirname, fileid + ".xml")
jpeg_file = os.path.join(dirname, "JPEGImages", fileid + ".jpg")
with PathManager.open(anno_file) as f:
tree = ET.parse(f)
r = {
"file_name": jpeg_file,
"image_id": fileid,
"height": int(tree.findall("./size/height")[0].text),
"width": int(tree.findall("./size/width")[0].text),
}
instances = []
for obj in tree.findall("object"):
cls_name = obj.find("name").text
if cls_name not in known_class_list:
continue
if 'unk' in split:
cls = "unknown"
else:
cls = cls_name
# We include "difficult" samples in training.
# Based on limited experiments, they don't hurt accuracy.
# difficult = int(obj.find("difficult").text)
# if difficult == 1:
# continue
bbox = obj.find("bndbox")
bbox = [float(bbox.find(x).text) for x in ["xmin", "ymin", "xmax", "ymax"]]
# Original annotations are integers in the range [1, W or H]
# Assuming they mean 1-based pixel indices (inclusive),
# a box with annotation (xmin=1, xmax=W) covers the whole image.
# In coordinate space this is represented by (xmin=0, xmax=W)
bbox[0] -= 1.0
bbox[1] -= 1.0
instances.append(
{"category_id": class_names.index(cls), "bbox": bbox, "bbox_mode": BoxMode.XYXY_ABS}
)
r["annotations"] = instances
dicts.append(r)
return dicts
def register_voc_style_coco(name, dirname, split, year, class_names=INCR_CLASS_NAMES):
DatasetCatalog.register(name, lambda: load_voc_coco_instances(dirname, split, class_names))
MetadataCatalog.get(name).set(
thing_classes=list(class_names), dirname=dirname, year=year, split=split
)