mirror of https://github.com/RE-OWOD/RE-OWOD
Add files via upload
parent 3c9dcce2c7
commit 28e2825941
@@ -1 +1,54 @@

# DensePose in Detectron2

**Dense Human Pose Estimation In The Wild**

_Rıza Alp Güler, Natalia Neverova, Iasonas Kokkinos_

[[`densepose.org`](https://densepose.org)] [[`arXiv`](https://arxiv.org/abs/1802.00434)] [[`BibTeX`](#CitingDensePose)]

Dense human pose estimation aims at mapping all human pixels of an RGB image to the 3D surface of the human body.

<div align="center">
  <img src="https://drive.google.com/uc?export=view&id=1qfSOkpueo1kVZbXOuQJJhyagKjMgepsz" width="700px" />
</div>

In this repository, we provide the code to train and evaluate DensePose-RCNN. We also provide tools to visualize DensePose annotations and results.

# Quick Start

See [Getting Started](doc/GETTING_STARTED.md).
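An illustrative invocation of the bundled `apply_net.py` tool (included in full later in this diff); the config, weights, and image paths below are placeholders, not files shipped with this commit:

```
python apply_net.py show configs/densepose_rcnn_R_50_FPN_s1x.yaml model_final.pkl image.jpg dp_contour,bbox --output image_densepose.png
```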
# Model Zoo and Baselines

We provide a number of baseline results and trained models available for download. See [Model Zoo](doc/MODEL_ZOO.md) for details.

# License

Detectron2 is released under the [Apache 2.0 license](../../LICENSE).

## <a name="CitingDensePose"></a>Citing DensePose

If you use DensePose, please use the following BibTeX entries:

For DensePose with estimated confidences:

```
@InProceedings{Neverova2019DensePoseConfidences,
    title = {Correlated Uncertainty for Learning Dense Correspondences from Noisy Labels},
    author = {Neverova, Natalia and Novotny, David and Vedaldi, Andrea},
    journal = {Advances in Neural Information Processing Systems},
    year = {2019},
}
```

For the original DensePose:

```
@InProceedings{Guler2018DensePose,
    title = {DensePose: Dense Human Pose Estimation In The Wild},
    author = {R{\i}za Alp G{\"u}ler and Natalia Neverova and Iasonas Kokkinos},
    journal = {The IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
    year = {2018}
}
```
@@ -0,0 +1,319 @@

#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved

import argparse
import glob
import logging
import os
import pickle
import sys
from typing import Any, ClassVar, Dict, List
import torch

from detectron2.config import get_cfg
from detectron2.data.detection_utils import read_image
from detectron2.engine.defaults import DefaultPredictor
from detectron2.structures.boxes import BoxMode
from detectron2.structures.instances import Instances
from detectron2.utils.logger import setup_logger

from densepose import add_densepose_config, add_hrnet_config
from densepose.utils.logger import verbosity_to_level
from densepose.vis.base import CompoundVisualizer
from densepose.vis.bounding_box import ScoredBoundingBoxVisualizer
from densepose.vis.densepose import (
    DensePoseResultsContourVisualizer,
    DensePoseResultsFineSegmentationVisualizer,
    DensePoseResultsUVisualizer,
    DensePoseResultsVVisualizer,
)
from densepose.vis.extractor import CompoundExtractor, create_extractor

DOC = """Apply Net - a tool to print / visualize DensePose results
"""

LOGGER_NAME = "apply_net"
logger = logging.getLogger(LOGGER_NAME)

_ACTION_REGISTRY: Dict[str, "Action"] = {}


class Action(object):
    @classmethod
    def add_arguments(cls: type, parser: argparse.ArgumentParser):
        parser.add_argument(
            "-v",
            "--verbosity",
            action="count",
            help="Verbose mode. Multiple -v options increase the verbosity.",
        )


def register_action(cls: type):
    """
    Decorator for action classes to automate action registration
    """
    global _ACTION_REGISTRY
    _ACTION_REGISTRY[cls.COMMAND] = cls
    return cls


class InferenceAction(Action):
    @classmethod
    def add_arguments(cls: type, parser: argparse.ArgumentParser):
        super(InferenceAction, cls).add_arguments(parser)
        parser.add_argument("cfg", metavar="<config>", help="Config file")
        parser.add_argument("model", metavar="<model>", help="Model file")
        parser.add_argument("input", metavar="<input>", help="Input data")
        parser.add_argument(
            "--opts",
            help="Modify config options using the command-line 'KEY VALUE' pairs",
            default=[],
            nargs=argparse.REMAINDER,
        )

    @classmethod
    def execute(cls: type, args: argparse.Namespace):
        logger.info(f"Loading config from {args.cfg}")
        opts = []
        cfg = cls.setup_config(args.cfg, args.model, args, opts)
        logger.info(f"Loading model from {args.model}")
        predictor = DefaultPredictor(cfg)
        logger.info(f"Loading data from {args.input}")
        file_list = cls._get_input_file_list(args.input)
        if len(file_list) == 0:
            logger.warning(f"No input images for {args.input}")
            return
        context = cls.create_context(args)
        for file_name in file_list:
            img = read_image(file_name, format="BGR")  # predictor expects BGR image.
            with torch.no_grad():
                outputs = predictor(img)["instances"]
                cls.execute_on_outputs(context, {"file_name": file_name, "image": img}, outputs)
        cls.postexecute(context)

    @classmethod
    def setup_config(
        cls: type, config_fpath: str, model_fpath: str, args: argparse.Namespace, opts: List[str]
    ):
        cfg = get_cfg()
        add_densepose_config(cfg)
        add_hrnet_config(cfg)
        cfg.merge_from_file(config_fpath)
        cfg.merge_from_list(args.opts)
        if opts:
            cfg.merge_from_list(opts)
        cfg.MODEL.WEIGHTS = model_fpath
        cfg.freeze()
        return cfg

    @classmethod
    def _get_input_file_list(cls: type, input_spec: str):
        if os.path.isdir(input_spec):
            file_list = [
                os.path.join(input_spec, fname)
                for fname in os.listdir(input_spec)
                if os.path.isfile(os.path.join(input_spec, fname))
            ]
        elif os.path.isfile(input_spec):
            file_list = [input_spec]
        else:
            file_list = glob.glob(input_spec)
        return file_list


@register_action
class DumpAction(InferenceAction):
    """
    Dump action that outputs results to a pickle file
    """

    COMMAND: ClassVar[str] = "dump"

    @classmethod
    def add_parser(cls: type, subparsers: argparse._SubParsersAction):
        parser = subparsers.add_parser(cls.COMMAND, help="Dump model outputs to a file.")
        cls.add_arguments(parser)
        parser.set_defaults(func=cls.execute)

    @classmethod
    def add_arguments(cls: type, parser: argparse.ArgumentParser):
        super(DumpAction, cls).add_arguments(parser)
        parser.add_argument(
            "--output",
            metavar="<dump_file>",
            default="results.pkl",
            help="File name to save dump to",
        )

    @classmethod
    def execute_on_outputs(
        cls: type, context: Dict[str, Any], entry: Dict[str, Any], outputs: Instances
    ):
        image_fpath = entry["file_name"]
        logger.info(f"Processing {image_fpath}")
        result = {"file_name": image_fpath}
        if outputs.has("scores"):
            result["scores"] = outputs.get("scores").cpu()
        if outputs.has("pred_boxes"):
            result["pred_boxes_XYXY"] = outputs.get("pred_boxes").tensor.cpu()
            if outputs.has("pred_densepose"):
                boxes_XYWH = BoxMode.convert(
                    result["pred_boxes_XYXY"], BoxMode.XYXY_ABS, BoxMode.XYWH_ABS
                )
                result["pred_densepose"] = outputs.get("pred_densepose").to_result(boxes_XYWH)
        context["results"].append(result)

    @classmethod
    def create_context(cls: type, args: argparse.Namespace):
        context = {"results": [], "out_fname": args.output}
        return context

    @classmethod
    def postexecute(cls: type, context: Dict[str, Any]):
        out_fname = context["out_fname"]
        out_dir = os.path.dirname(out_fname)
        if len(out_dir) > 0 and not os.path.exists(out_dir):
            os.makedirs(out_dir)
        with open(out_fname, "wb") as hFile:
            pickle.dump(context["results"], hFile)
            logger.info(f"Output saved to {out_fname}")


@register_action
class ShowAction(InferenceAction):
    """
    Show action that visualizes selected entries on an image
    """

    COMMAND: ClassVar[str] = "show"
    VISUALIZERS: ClassVar[Dict[str, object]] = {
        "dp_contour": DensePoseResultsContourVisualizer,
        "dp_segm": DensePoseResultsFineSegmentationVisualizer,
        "dp_u": DensePoseResultsUVisualizer,
        "dp_v": DensePoseResultsVVisualizer,
        "bbox": ScoredBoundingBoxVisualizer,
    }

    @classmethod
    def add_parser(cls: type, subparsers: argparse._SubParsersAction):
        parser = subparsers.add_parser(cls.COMMAND, help="Visualize selected entries")
        cls.add_arguments(parser)
        parser.set_defaults(func=cls.execute)

    @classmethod
    def add_arguments(cls: type, parser: argparse.ArgumentParser):
        super(ShowAction, cls).add_arguments(parser)
        parser.add_argument(
            "visualizations",
            metavar="<visualizations>",
            help="Comma separated list of visualizations, possible values: "
            "[{}]".format(",".join(sorted(cls.VISUALIZERS.keys()))),
        )
        parser.add_argument(
            "--min_score",
            metavar="<score>",
            default=0.8,
            type=float,
            help="Minimum detection score to visualize",
        )
        parser.add_argument(
            "--nms_thresh", metavar="<threshold>", default=None, type=float, help="NMS threshold"
        )
        parser.add_argument(
            "--output",
            metavar="<image_file>",
            default="outputres.png",
            help="File name to save output to",
        )

    @classmethod
    def setup_config(
        cls: type, config_fpath: str, model_fpath: str, args: argparse.Namespace, opts: List[str]
    ):
        opts.append("MODEL.ROI_HEADS.SCORE_THRESH_TEST")
        opts.append(str(args.min_score))
        if args.nms_thresh is not None:
            opts.append("MODEL.ROI_HEADS.NMS_THRESH_TEST")
            opts.append(str(args.nms_thresh))
        cfg = super(ShowAction, cls).setup_config(config_fpath, model_fpath, args, opts)
        return cfg

    @classmethod
    def execute_on_outputs(
        cls: type, context: Dict[str, Any], entry: Dict[str, Any], outputs: Instances
    ):
        import cv2
        import numpy as np

        visualizer = context["visualizer"]
        extractor = context["extractor"]
        image_fpath = entry["file_name"]
        logger.info(f"Processing {image_fpath}")
        image = cv2.cvtColor(entry["image"], cv2.COLOR_BGR2GRAY)
        image = np.tile(image[:, :, np.newaxis], [1, 1, 3])
        data = extractor(outputs)
        image_vis = visualizer.visualize(image, data)
        entry_idx = context["entry_idx"] + 1
        out_fname = cls._get_out_fname(entry_idx, context["out_fname"])
        out_dir = os.path.dirname(out_fname)
        if len(out_dir) > 0 and not os.path.exists(out_dir):
            os.makedirs(out_dir)
        cv2.imwrite(out_fname, image_vis)
        logger.info(f"Output saved to {out_fname}")
        context["entry_idx"] += 1

    @classmethod
    def postexecute(cls: type, context: Dict[str, Any]):
        pass

    @classmethod
    def _get_out_fname(cls: type, entry_idx: int, fname_base: str):
        base, ext = os.path.splitext(fname_base)
        return base + ".{0:04d}".format(entry_idx) + ext

    @classmethod
    def create_context(cls: type, args: argparse.Namespace) -> Dict[str, Any]:
        vis_specs = args.visualizations.split(",")
        visualizers = []
        extractors = []
        for vis_spec in vis_specs:
            vis = cls.VISUALIZERS[vis_spec]()
            visualizers.append(vis)
            extractor = create_extractor(vis)
            extractors.append(extractor)
        visualizer = CompoundVisualizer(visualizers)
        extractor = CompoundExtractor(extractors)
        context = {
            "extractor": extractor,
            "visualizer": visualizer,
            "out_fname": args.output,
            "entry_idx": 0,
        }
        return context


def create_argument_parser() -> argparse.ArgumentParser:
    parser = argparse.ArgumentParser(
        description=DOC,
        formatter_class=lambda prog: argparse.HelpFormatter(prog, max_help_position=120),
    )
    parser.set_defaults(func=lambda _: parser.print_help(sys.stdout))
    subparsers = parser.add_subparsers(title="Actions")
    for _, action in _ACTION_REGISTRY.items():
        action.add_parser(subparsers)
    return parser


def main():
    parser = create_argument_parser()
    args = parser.parse_args()
    verbosity = args.verbosity if hasattr(args, "verbosity") else None
    global logger
    logger = setup_logger(name=LOGGER_NAME)
    logger.setLevel(verbosity_to_level(verbosity))
    args.func(args)


if __name__ == "__main__":
    main()
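The script above registers its actions via the `register_action` decorator and dispatches through `args.func`. A minimal sketch of driving it programmatically rather than from the shell (paths are placeholders; this assumes the file is importable as `apply_net`):

```python
import pickle

from apply_net import create_argument_parser

# Parse arguments exactly as the CLI would; "dump" runs DumpAction.execute.
parser = create_argument_parser()
args = parser.parse_args(
    [
        "dump",
        "configs/densepose_rcnn_R_50_FPN_s1x.yaml",  # <config> (placeholder)
        "model_final.pkl",  # <model> (placeholder)
        "images/",  # <input>: a file, a directory, or a glob pattern
        "--output",
        "results.pkl",
    ]
)
args.func(args)

# Each dumped entry carries "file_name", "scores", "pred_boxes_XYXY",
# and (when DensePose predictions exist) "pred_densepose".
with open("results.pkl", "rb") as f:
    results = pickle.load(f)
print(len(results))
```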
@@ -0,0 +1,48 @@

VERSION: 2
MODEL:
  META_ARCHITECTURE: "GeneralizedRCNN"
  BACKBONE:
    NAME: "build_resnet_fpn_backbone"
  RESNETS:
    OUT_FEATURES: ["res2", "res3", "res4", "res5"]
  FPN:
    IN_FEATURES: ["res2", "res3", "res4", "res5"]
  ANCHOR_GENERATOR:
    SIZES: [[32], [64], [128], [256], [512]]  # One size for each in feature map
    ASPECT_RATIOS: [[0.5, 1.0, 2.0]]  # Three aspect ratios (same for all in feature maps)
  RPN:
    IN_FEATURES: ["p2", "p3", "p4", "p5", "p6"]
    PRE_NMS_TOPK_TRAIN: 2000  # Per FPN level
    PRE_NMS_TOPK_TEST: 1000  # Per FPN level
    # Detectron1 uses 2000 proposals per-batch,
    # (See "modeling/rpn/rpn_outputs.py" for details of this legacy issue)
    # which is approximately 1000 proposals per-image since the default batch size for FPN is 2.
    POST_NMS_TOPK_TRAIN: 1000
    POST_NMS_TOPK_TEST: 1000

  DENSEPOSE_ON: True
  ROI_HEADS:
    NAME: "DensePoseROIHeads"
    IN_FEATURES: ["p2", "p3", "p4", "p5"]
    NUM_CLASSES: 1
  ROI_BOX_HEAD:
    NAME: "FastRCNNConvFCHead"
    NUM_FC: 2
    POOLER_RESOLUTION: 7
    POOLER_SAMPLING_RATIO: 2
    POOLER_TYPE: "ROIAlign"
  ROI_DENSEPOSE_HEAD:
    NAME: "DensePoseV1ConvXHead"
    POOLER_TYPE: "ROIAlign"
    NUM_COARSE_SEGM_CHANNELS: 2
DATASETS:
  TRAIN: ("densepose_coco_2014_train", "densepose_coco_2014_valminusminival")
  TEST: ("densepose_coco_2014_minival",)
SOLVER:
  IMS_PER_BATCH: 16
  BASE_LR: 0.01
  STEPS: (60000, 80000)
  MAX_ITER: 90000
  WARMUP_FACTOR: 0.1
INPUT:
  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
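For reference, a minimal sketch of how a base file like the one above is consumed (the path is a placeholder; `add_densepose_config` is defined in the densepose config module later in this diff):

```python
from detectron2.config import get_cfg

from densepose import add_densepose_config

cfg = get_cfg()  # detectron2 defaults
add_densepose_config(cfg)  # register DensePose-specific keys (ROI_DENSEPOSE_HEAD, HRNET, ...)
cfg.merge_from_file("configs/Base-DensePose-RCNN-FPN.yaml")  # placeholder path
cfg.merge_from_list(["SOLVER.IMS_PER_BATCH", "8"])  # optional CLI-style overrides
cfg.freeze()
print(cfg.MODEL.ROI_DENSEPOSE_HEAD.NAME)  # "DensePoseV1ConvXHead"
```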
@@ -0,0 +1,16 @@

_BASE_: "../Base-DensePose-RCNN-FPN.yaml"
MODEL:
  WEIGHTS: "https://1drv.ms/u/s!Aus8VCZ_C_33dYBMemi9xOUFR0w"
  BACKBONE:
    NAME: "build_hrfpn_backbone"
  RPN:
    IN_FEATURES: ['p1', 'p2', 'p3', 'p4', 'p5']
  ROI_HEADS:
    IN_FEATURES: ['p1', 'p2', 'p3', 'p4', 'p5']
SOLVER:
  MAX_ITER: 130000
  STEPS: (100000, 120000)
  CLIP_GRADIENTS:
    ENABLED: True
    CLIP_TYPE: "norm"
  BASE_LR: 0.03

@@ -0,0 +1,23 @@

_BASE_: "../Base-DensePose-RCNN-FPN.yaml"
MODEL:
  WEIGHTS: "https://1drv.ms/u/s!Aus8VCZ_C_33ck0gvo5jfoWBOPo"
  BACKBONE:
    NAME: "build_hrfpn_backbone"
  RPN:
    IN_FEATURES: ['p1', 'p2', 'p3', 'p4', 'p5']
  ROI_HEADS:
    IN_FEATURES: ['p1', 'p2', 'p3', 'p4', 'p5']
  HRNET:
    STAGE2:
      NUM_CHANNELS: [40, 80]
    STAGE3:
      NUM_CHANNELS: [40, 80, 160]
    STAGE4:
      NUM_CHANNELS: [40, 80, 160, 320]
SOLVER:
  MAX_ITER: 130000
  STEPS: (100000, 120000)
  CLIP_GRADIENTS:
    ENABLED: True
    CLIP_TYPE: "norm"
  BASE_LR: 0.03

@@ -0,0 +1,23 @@

_BASE_: "../Base-DensePose-RCNN-FPN.yaml"
MODEL:
  WEIGHTS: "https://1drv.ms/u/s!Aus8VCZ_C_33dKvqI6pBZlifgJk"
  BACKBONE:
    NAME: "build_hrfpn_backbone"
  RPN:
    IN_FEATURES: ['p1', 'p2', 'p3', 'p4', 'p5']
  ROI_HEADS:
    IN_FEATURES: ['p1', 'p2', 'p3', 'p4', 'p5']
  HRNET:
    STAGE2:
      NUM_CHANNELS: [48, 96]
    STAGE3:
      NUM_CHANNELS: [48, 96, 192]
    STAGE4:
      NUM_CHANNELS: [48, 96, 192, 384]
SOLVER:
  MAX_ITER: 130000
  STEPS: (100000, 120000)
  CLIP_GRADIENTS:
    ENABLED: True
    CLIP_TYPE: "norm"
  BASE_LR: 0.03
@@ -0,0 +1,18 @@

_BASE_: "Base-DensePose-RCNN-FPN.yaml"
MODEL:
  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
  RESNETS:
    DEPTH: 101
  ROI_DENSEPOSE_HEAD:
    NAME: "DensePoseDeepLabHead"
    UV_CONFIDENCE:
      ENABLED: True
      TYPE: "iid_iso"
    SEGM_CONFIDENCE:
      ENABLED: True
    POINT_REGRESSION_WEIGHTS: 0.0005
SOLVER:
  CLIP_GRADIENTS:
    ENABLED: True
  MAX_ITER: 130000
  STEPS: (100000, 120000)

@@ -0,0 +1,16 @@

_BASE_: "Base-DensePose-RCNN-FPN.yaml"
MODEL:
  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
  RESNETS:
    DEPTH: 101
  ROI_DENSEPOSE_HEAD:
    NAME: "DensePoseDeepLabHead"
    UV_CONFIDENCE:
      ENABLED: True
      TYPE: "iid_iso"
    POINT_REGRESSION_WEIGHTS: 0.0005
SOLVER:
  CLIP_GRADIENTS:
    ENABLED: True
  MAX_ITER: 130000
  STEPS: (100000, 120000)

@@ -0,0 +1,18 @@

_BASE_: "Base-DensePose-RCNN-FPN.yaml"
MODEL:
  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
  RESNETS:
    DEPTH: 101
  ROI_DENSEPOSE_HEAD:
    NAME: "DensePoseDeepLabHead"
    UV_CONFIDENCE:
      ENABLED: True
      TYPE: "indep_aniso"
    SEGM_CONFIDENCE:
      ENABLED: True
    POINT_REGRESSION_WEIGHTS: 0.0005
SOLVER:
  CLIP_GRADIENTS:
    ENABLED: True
  MAX_ITER: 130000
  STEPS: (100000, 120000)

@@ -0,0 +1,16 @@

_BASE_: "Base-DensePose-RCNN-FPN.yaml"
MODEL:
  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
  RESNETS:
    DEPTH: 101
  ROI_DENSEPOSE_HEAD:
    NAME: "DensePoseDeepLabHead"
    UV_CONFIDENCE:
      ENABLED: True
      TYPE: "indep_aniso"
    POINT_REGRESSION_WEIGHTS: 0.0005
SOLVER:
  CLIP_GRADIENTS:
    ENABLED: True
  MAX_ITER: 130000
  STEPS: (100000, 120000)

@@ -0,0 +1,10 @@

_BASE_: "Base-DensePose-RCNN-FPN.yaml"
MODEL:
  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
  RESNETS:
    DEPTH: 101
  ROI_DENSEPOSE_HEAD:
    NAME: "DensePoseDeepLabHead"
SOLVER:
  MAX_ITER: 130000
  STEPS: (100000, 120000)

@@ -0,0 +1,18 @@

_BASE_: "Base-DensePose-RCNN-FPN.yaml"
MODEL:
  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
  RESNETS:
    DEPTH: 101
  ROI_DENSEPOSE_HEAD:
    UV_CONFIDENCE:
      ENABLED: True
      TYPE: "iid_iso"
    SEGM_CONFIDENCE:
      ENABLED: True
    POINT_REGRESSION_WEIGHTS: 0.0005
SOLVER:
  CLIP_GRADIENTS:
    ENABLED: True
  MAX_ITER: 130000
  STEPS: (100000, 120000)
  WARMUP_FACTOR: 0.025

@@ -0,0 +1,16 @@

_BASE_: "Base-DensePose-RCNN-FPN.yaml"
MODEL:
  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
  RESNETS:
    DEPTH: 101
  ROI_DENSEPOSE_HEAD:
    UV_CONFIDENCE:
      ENABLED: True
      TYPE: "iid_iso"
    POINT_REGRESSION_WEIGHTS: 0.0005
SOLVER:
  CLIP_GRADIENTS:
    ENABLED: True
  MAX_ITER: 130000
  STEPS: (100000, 120000)
  WARMUP_FACTOR: 0.025

@@ -0,0 +1,18 @@

_BASE_: "Base-DensePose-RCNN-FPN.yaml"
MODEL:
  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
  RESNETS:
    DEPTH: 101
  ROI_DENSEPOSE_HEAD:
    UV_CONFIDENCE:
      ENABLED: True
      TYPE: "indep_aniso"
    SEGM_CONFIDENCE:
      ENABLED: True
    POINT_REGRESSION_WEIGHTS: 0.0005
SOLVER:
  CLIP_GRADIENTS:
    ENABLED: True
  MAX_ITER: 130000
  STEPS: (100000, 120000)
  WARMUP_FACTOR: 0.025

@@ -0,0 +1,16 @@

_BASE_: "Base-DensePose-RCNN-FPN.yaml"
MODEL:
  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
  RESNETS:
    DEPTH: 101
  ROI_DENSEPOSE_HEAD:
    UV_CONFIDENCE:
      ENABLED: True
      TYPE: "indep_aniso"
    POINT_REGRESSION_WEIGHTS: 0.0005
SOLVER:
  CLIP_GRADIENTS:
    ENABLED: True
  MAX_ITER: 130000
  STEPS: (100000, 120000)
  WARMUP_FACTOR: 0.025

@@ -0,0 +1,8 @@

_BASE_: "Base-DensePose-RCNN-FPN.yaml"
MODEL:
  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
  RESNETS:
    DEPTH: 101
SOLVER:
  MAX_ITER: 130000
  STEPS: (100000, 120000)

@@ -0,0 +1,17 @@

_BASE_: "Base-DensePose-RCNN-FPN.yaml"
MODEL:
  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
  RESNETS:
    DEPTH: 101
  ROI_DENSEPOSE_HEAD:
    NUM_COARSE_SEGM_CHANNELS: 15
    POOLER_RESOLUTION: 14
    HEATMAP_SIZE: 56
    INDEX_WEIGHTS: 2.0
    PART_WEIGHTS: 0.3
    POINT_REGRESSION_WEIGHTS: 0.1
    DECODER_ON: False
SOLVER:
  BASE_LR: 0.002
  MAX_ITER: 130000
  STEPS: (100000, 120000)
@@ -0,0 +1,18 @@

_BASE_: "Base-DensePose-RCNN-FPN.yaml"
MODEL:
  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
  RESNETS:
    DEPTH: 50
  ROI_DENSEPOSE_HEAD:
    NAME: "DensePoseDeepLabHead"
    UV_CONFIDENCE:
      ENABLED: True
      TYPE: "iid_iso"
    SEGM_CONFIDENCE:
      ENABLED: True
    POINT_REGRESSION_WEIGHTS: 0.0005
SOLVER:
  CLIP_GRADIENTS:
    ENABLED: True
  MAX_ITER: 130000
  STEPS: (100000, 120000)

@@ -0,0 +1,16 @@

_BASE_: "Base-DensePose-RCNN-FPN.yaml"
MODEL:
  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
  RESNETS:
    DEPTH: 50
  ROI_DENSEPOSE_HEAD:
    NAME: "DensePoseDeepLabHead"
    UV_CONFIDENCE:
      ENABLED: True
      TYPE: "iid_iso"
    POINT_REGRESSION_WEIGHTS: 0.0005
SOLVER:
  CLIP_GRADIENTS:
    ENABLED: True
  MAX_ITER: 130000
  STEPS: (100000, 120000)

@@ -0,0 +1,18 @@

_BASE_: "Base-DensePose-RCNN-FPN.yaml"
MODEL:
  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
  RESNETS:
    DEPTH: 50
  ROI_DENSEPOSE_HEAD:
    NAME: "DensePoseDeepLabHead"
    UV_CONFIDENCE:
      ENABLED: True
      TYPE: "indep_aniso"
    SEGM_CONFIDENCE:
      ENABLED: True
    POINT_REGRESSION_WEIGHTS: 0.0005
SOLVER:
  CLIP_GRADIENTS:
    ENABLED: True
  MAX_ITER: 130000
  STEPS: (100000, 120000)

@@ -0,0 +1,16 @@

_BASE_: "Base-DensePose-RCNN-FPN.yaml"
MODEL:
  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
  RESNETS:
    DEPTH: 50
  ROI_DENSEPOSE_HEAD:
    NAME: "DensePoseDeepLabHead"
    UV_CONFIDENCE:
      ENABLED: True
      TYPE: "indep_aniso"
    POINT_REGRESSION_WEIGHTS: 0.0005
SOLVER:
  CLIP_GRADIENTS:
    ENABLED: True
  MAX_ITER: 130000
  STEPS: (100000, 120000)

@@ -0,0 +1,10 @@

_BASE_: "Base-DensePose-RCNN-FPN.yaml"
MODEL:
  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
  RESNETS:
    DEPTH: 50
  ROI_DENSEPOSE_HEAD:
    NAME: "DensePoseDeepLabHead"
SOLVER:
  MAX_ITER: 130000
  STEPS: (100000, 120000)

@@ -0,0 +1,20 @@

_BASE_: "Base-DensePose-RCNN-FPN.yaml"
MODEL:
  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
  RESNETS:
    DEPTH: 50
  ROI_DENSEPOSE_HEAD:
    UV_CONFIDENCE:
      ENABLED: True
      TYPE: "iid_iso"
    SEGM_CONFIDENCE:
      ENABLED: True
    POINT_REGRESSION_WEIGHTS: 0.0005
SOLVER:
  CLIP_GRADIENTS:
    ENABLED: True
    CLIP_TYPE: norm
    CLIP_VALUE: 100.0
  MAX_ITER: 130000
  STEPS: (100000, 120000)
  WARMUP_FACTOR: 0.025

@@ -0,0 +1,16 @@

_BASE_: "Base-DensePose-RCNN-FPN.yaml"
MODEL:
  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
  RESNETS:
    DEPTH: 50
  ROI_DENSEPOSE_HEAD:
    UV_CONFIDENCE:
      ENABLED: True
      TYPE: "iid_iso"
    POINT_REGRESSION_WEIGHTS: 0.0005
SOLVER:
  CLIP_GRADIENTS:
    ENABLED: True
  MAX_ITER: 130000
  STEPS: (100000, 120000)
  WARMUP_FACTOR: 0.025

@@ -0,0 +1,18 @@

_BASE_: "Base-DensePose-RCNN-FPN.yaml"
MODEL:
  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
  RESNETS:
    DEPTH: 50
  ROI_DENSEPOSE_HEAD:
    UV_CONFIDENCE:
      ENABLED: True
      TYPE: "indep_aniso"
    SEGM_CONFIDENCE:
      ENABLED: True
    POINT_REGRESSION_WEIGHTS: 0.0005
SOLVER:
  CLIP_GRADIENTS:
    ENABLED: True
  MAX_ITER: 130000
  STEPS: (100000, 120000)
  WARMUP_FACTOR: 0.025

@@ -0,0 +1,16 @@

_BASE_: "Base-DensePose-RCNN-FPN.yaml"
MODEL:
  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
  RESNETS:
    DEPTH: 50
  ROI_DENSEPOSE_HEAD:
    UV_CONFIDENCE:
      ENABLED: True
      TYPE: "indep_aniso"
    POINT_REGRESSION_WEIGHTS: 0.0005
SOLVER:
  CLIP_GRADIENTS:
    ENABLED: True
  MAX_ITER: 130000
  STEPS: (100000, 120000)
  WARMUP_FACTOR: 0.025

@@ -0,0 +1,8 @@

_BASE_: "Base-DensePose-RCNN-FPN.yaml"
MODEL:
  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
  RESNETS:
    DEPTH: 50
SOLVER:
  MAX_ITER: 130000
  STEPS: (100000, 120000)

@@ -0,0 +1,17 @@

_BASE_: "Base-DensePose-RCNN-FPN.yaml"
MODEL:
  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
  RESNETS:
    DEPTH: 50
  ROI_DENSEPOSE_HEAD:
    NUM_COARSE_SEGM_CHANNELS: 15
    POOLER_RESOLUTION: 14
    HEATMAP_SIZE: 56
    INDEX_WEIGHTS: 2.0
    PART_WEIGHTS: 0.3
    POINT_REGRESSION_WEIGHTS: 0.1
    DECODER_ON: False
SOLVER:
  BASE_LR: 0.002
  MAX_ITER: 130000
  STEPS: (100000, 120000)
@@ -0,0 +1,121 @@

MODEL:
  META_ARCHITECTURE: "GeneralizedRCNN"
  BACKBONE:
    NAME: "build_resnet_fpn_backbone"
  RESNETS:
    OUT_FEATURES: ["res2", "res3", "res4", "res5"]
  FPN:
    IN_FEATURES: ["res2", "res3", "res4", "res5"]
  ANCHOR_GENERATOR:
    SIZES: [[32], [64], [128], [256], [512]]  # One size for each in feature map
    ASPECT_RATIOS: [[0.5, 1.0, 2.0]]  # Three aspect ratios (same for all in feature maps)
  RPN:
    IN_FEATURES: ["p2", "p3", "p4", "p5", "p6"]
    PRE_NMS_TOPK_TRAIN: 2000  # Per FPN level
    PRE_NMS_TOPK_TEST: 1000  # Per FPN level
    # Detectron1 uses 2000 proposals per-batch,
    # (See "modeling/rpn/rpn_outputs.py" for details of this legacy issue)
    # which is approximately 1000 proposals per-image since the default batch size for FPN is 2.
    POST_NMS_TOPK_TRAIN: 1000
    POST_NMS_TOPK_TEST: 1000
  ROI_HEADS:
    NAME: "StandardROIHeads"
    IN_FEATURES: ["p2", "p3", "p4", "p5"]
    NUM_CLASSES: 1
  ROI_BOX_HEAD:
    NAME: "FastRCNNConvFCHead"
    NUM_FC: 2
    POOLER_RESOLUTION: 7
  ROI_MASK_HEAD:
    NAME: "MaskRCNNConvUpsampleHead"
    NUM_CONV: 4
    POOLER_RESOLUTION: 14
DATASETS:
  TRAIN: ("base_coco_2017_train",)
  TEST: ("base_coco_2017_val", "densepose_chimps")
  CATEGORY_MAPS:
    "base_coco_2017_train":
      "16": 1  # bird -> person
      "17": 1  # cat -> person
      "18": 1  # dog -> person
      "19": 1  # horse -> person
      "20": 1  # sheep -> person
      "21": 1  # cow -> person
      "22": 1  # elephant -> person
      "23": 1  # bear -> person
      "24": 1  # zebra -> person
      "25": 1  # giraffe -> person
    "base_coco_2017_val":
      "16": 1  # bird -> person
      "17": 1  # cat -> person
      "18": 1  # dog -> person
      "19": 1  # horse -> person
      "20": 1  # sheep -> person
      "21": 1  # cow -> person
      "22": 1  # elephant -> person
      "23": 1  # bear -> person
      "24": 1  # zebra -> person
      "25": 1  # giraffe -> person
  WHITELISTED_CATEGORIES:
    "base_coco_2017_train":
      - 1   # person
      - 16  # bird
      - 17  # cat
      - 18  # dog
      - 19  # horse
      - 20  # sheep
      - 21  # cow
      - 22  # elephant
      - 23  # bear
      - 24  # zebra
      - 25  # giraffe
    "base_coco_2017_val":
      - 1   # person
      - 16  # bird
      - 17  # cat
      - 18  # dog
      - 19  # horse
      - 20  # sheep
      - 21  # cow
      - 22  # elephant
      - 23  # bear
      - 24  # zebra
      - 25  # giraffe
BOOTSTRAP_DATASETS:
  - DATASET: "chimpnsee"
    RATIO: 1.0
    IMAGE_LOADER:
      TYPE: "video_keyframe"
      SELECT:
        STRATEGY: "random_k"
        NUM_IMAGES: 4
      TRANSFORM:
        TYPE: "resize"
        MIN_SIZE: 800
        MAX_SIZE: 1333
      BATCH_SIZE: 8
      NUM_WORKERS: 1
    INFERENCE:
      INPUT_BATCH_SIZE: 1
      OUTPUT_BATCH_SIZE: 1
    DATA_SAMPLER:
      # supported types:
      #   densepose_uniform
      #   densepose_UV_confidence
      #   densepose_fine_segm_confidence
      #   densepose_coarse_segm_confidence
      TYPE: "densepose_uniform"
      COUNT_PER_CLASS: 8
    FILTER:
      TYPE: "detection_score"
      MIN_VALUE: 0.8
BOOTSTRAP_MODEL:
  WEIGHTS: ""
SOLVER:
  IMS_PER_BATCH: 16
  BASE_LR: 0.02
  STEPS: (60000, 80000)
  MAX_ITER: 90000
INPUT:
  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
VERSION: 2
@@ -0,0 +1,91 @@

MODEL:
  META_ARCHITECTURE: "GeneralizedRCNN"
  BACKBONE:
    NAME: "build_resnet_fpn_backbone"
  RESNETS:
    OUT_FEATURES: ["res2", "res3", "res4", "res5"]
  FPN:
    IN_FEATURES: ["res2", "res3", "res4", "res5"]
  ANCHOR_GENERATOR:
    SIZES: [[32], [64], [128], [256], [512]]  # One size for each in feature map
    ASPECT_RATIOS: [[0.5, 1.0, 2.0]]  # Three aspect ratios (same for all in feature maps)
  RPN:
    IN_FEATURES: ["p2", "p3", "p4", "p5", "p6"]
    PRE_NMS_TOPK_TRAIN: 2000  # Per FPN level
    PRE_NMS_TOPK_TEST: 1000  # Per FPN level
    # Detectron1 uses 2000 proposals per-batch,
    # (See "modeling/rpn/rpn_outputs.py" for details of this legacy issue)
    # which is approximately 1000 proposals per-image since the default batch size for FPN is 2.
    POST_NMS_TOPK_TRAIN: 1000
    POST_NMS_TOPK_TEST: 1000
  ROI_HEADS:
    NAME: "StandardROIHeads"
    IN_FEATURES: ["p2", "p3", "p4", "p5"]
    NUM_CLASSES: 1
  ROI_BOX_HEAD:
    NAME: "FastRCNNConvFCHead"
    NUM_FC: 2
    POOLER_RESOLUTION: 7
  ROI_MASK_HEAD:
    NAME: "MaskRCNNConvUpsampleHead"
    NUM_CONV: 4
    POOLER_RESOLUTION: 14
DATASETS:
  TRAIN: ("base_coco_2017_train",)
  TEST: ("base_coco_2017_val", "densepose_chimps")
  CATEGORY_MAPS:
    "base_coco_2017_train":
      "16": 1  # bird -> person
      "17": 1  # cat -> person
      "18": 1  # dog -> person
      "19": 1  # horse -> person
      "20": 1  # sheep -> person
      "21": 1  # cow -> person
      "22": 1  # elephant -> person
      "23": 1  # bear -> person
      "24": 1  # zebra -> person
      "25": 1  # giraffe -> person
    "base_coco_2017_val":
      "16": 1  # bird -> person
      "17": 1  # cat -> person
      "18": 1  # dog -> person
      "19": 1  # horse -> person
      "20": 1  # sheep -> person
      "21": 1  # cow -> person
      "22": 1  # elephant -> person
      "23": 1  # bear -> person
      "24": 1  # zebra -> person
      "25": 1  # giraffe -> person
  WHITELISTED_CATEGORIES:
    "base_coco_2017_train":
      - 1   # person
      - 16  # bird
      - 17  # cat
      - 18  # dog
      - 19  # horse
      - 20  # sheep
      - 21  # cow
      - 22  # elephant
      - 23  # bear
      - 24  # zebra
      - 25  # giraffe
    "base_coco_2017_val":
      - 1   # person
      - 16  # bird
      - 17  # cat
      - 18  # dog
      - 19  # horse
      - 20  # sheep
      - 21  # cow
      - 22  # elephant
      - 23  # bear
      - 24  # zebra
      - 25  # giraffe
SOLVER:
  IMS_PER_BATCH: 16
  BASE_LR: 0.02
  STEPS: (60000, 80000)
  MAX_ITER: 90000
INPUT:
  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
VERSION: 2
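For intuition, the `CATEGORY_MAPS` block above remaps source dataset category ids onto a single target id before training. A minimal sketch of the effect (the helper below is illustrative, not the repository's implementation; the actual remapping lives in the densepose dataset-loading code):

```python
# Illustrative sketch of what a CATEGORY_MAPS entry amounts to (hypothetical helper).
from typing import Any, Dict, List


def apply_category_map(annotations: List[Dict[str, Any]], category_map: Dict[str, int]) -> None:
    """Remap annotation category ids in place, e.g. {"16": 1} folds 'bird' into 'person'."""
    for ann in annotations:
        cat_id = str(ann["category_id"])
        if cat_id in category_map:
            ann["category_id"] = category_map[cat_id]


anns = [{"category_id": 16}, {"category_id": 1}]
apply_category_map(anns, {"16": 1})
assert all(a["category_id"] == 1 for a in anns)
```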
@@ -0,0 +1,19 @@

_BASE_: "Base-RCNN-FPN-MC.yaml"
MODEL:
  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
  RESNETS:
    DEPTH: 101
  DENSEPOSE_ON: True
  ROI_HEADS:
    NAME: "DensePoseROIHeads"
    IN_FEATURES: ["p2", "p3", "p4", "p5"]
    NUM_CLASSES: 1
  ROI_DENSEPOSE_HEAD:
    NAME: "DensePoseV1ConvXHead"
    POOLER_TYPE: "ROIAlign"
    NUM_COARSE_SEGM_CHANNELS: 2
    COARSE_SEGM_TRAINED_BY_MASKS: True
    INDEX_WEIGHTS: 1.0
DATASETS:
  TRAIN: ("base_coco_2017_train", "densepose_coco_2014_train")
  TEST: ("densepose_chimps",)

@@ -0,0 +1,19 @@

_BASE_: "Base-RCNN-FPN-MC.yaml"
MODEL:
  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
  RESNETS:
    DEPTH: 101
  DENSEPOSE_ON: True
  ROI_HEADS:
    NAME: "DensePoseROIHeads"
    IN_FEATURES: ["p2", "p3", "p4", "p5"]
    NUM_CLASSES: 1
  ROI_DENSEPOSE_HEAD:
    NAME: "DensePoseDeepLabHead"
    POOLER_TYPE: "ROIAlign"
    NUM_COARSE_SEGM_CHANNELS: 2
    COARSE_SEGM_TRAINED_BY_MASKS: True
    INDEX_WEIGHTS: 1.0
DATASETS:
  TRAIN: ("base_coco_2017_train", "densepose_coco_2014_train")
  TEST: ("densepose_chimps",)

@@ -0,0 +1,29 @@

_BASE_: "Base-RCNN-FPN-MC.yaml"
MODEL:
  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
  RESNETS:
    DEPTH: 101
  DENSEPOSE_ON: True
  ROI_HEADS:
    NAME: "DensePoseROIHeads"
    IN_FEATURES: ["p2", "p3", "p4", "p5"]
    NUM_CLASSES: 1
  ROI_DENSEPOSE_HEAD:
    NAME: "DensePoseDeepLabHead"
    UV_CONFIDENCE:
      ENABLED: True
      TYPE: "iid_iso"
    SEGM_CONFIDENCE:
      ENABLED: True
    POINT_REGRESSION_WEIGHTS: 0.0005
    POOLER_TYPE: "ROIAlign"
    NUM_COARSE_SEGM_CHANNELS: 2
    COARSE_SEGM_TRAINED_BY_MASKS: True
    INDEX_WEIGHTS: 1.0
SOLVER:
  CLIP_GRADIENTS:
    ENABLED: True
  WARMUP_FACTOR: 0.025
DATASETS:
  TRAIN: ("base_coco_2017_train", "densepose_coco_2014_train")
  TEST: ("densepose_chimps",)

@@ -0,0 +1,27 @@

_BASE_: "Base-RCNN-FPN-MC.yaml"
MODEL:
  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
  RESNETS:
    DEPTH: 101
  DENSEPOSE_ON: True
  ROI_HEADS:
    NAME: "DensePoseROIHeads"
    IN_FEATURES: ["p2", "p3", "p4", "p5"]
    NUM_CLASSES: 1
  ROI_DENSEPOSE_HEAD:
    NAME: "DensePoseDeepLabHead"
    UV_CONFIDENCE:
      ENABLED: True
      TYPE: "iid_iso"
    POINT_REGRESSION_WEIGHTS: 0.0005
    POOLER_TYPE: "ROIAlign"
    NUM_COARSE_SEGM_CHANNELS: 2
    COARSE_SEGM_TRAINED_BY_MASKS: True
    INDEX_WEIGHTS: 1.0
SOLVER:
  CLIP_GRADIENTS:
    ENABLED: True
  WARMUP_FACTOR: 0.025
DATASETS:
  TRAIN: ("base_coco_2017_train", "densepose_coco_2014_train")
  TEST: ("densepose_chimps",)

@@ -0,0 +1,29 @@

_BASE_: "Base-RCNN-FPN-MC.yaml"
MODEL:
  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
  RESNETS:
    DEPTH: 101
  DENSEPOSE_ON: True
  ROI_HEADS:
    NAME: "DensePoseROIHeads"
    IN_FEATURES: ["p2", "p3", "p4", "p5"]
    NUM_CLASSES: 1
  ROI_DENSEPOSE_HEAD:
    NAME: "DensePoseV1ConvXHead"
    UV_CONFIDENCE:
      ENABLED: True
      TYPE: "iid_iso"
    SEGM_CONFIDENCE:
      ENABLED: True
    POINT_REGRESSION_WEIGHTS: 0.0005
    POOLER_TYPE: "ROIAlign"
    NUM_COARSE_SEGM_CHANNELS: 2
    COARSE_SEGM_TRAINED_BY_MASKS: True
    INDEX_WEIGHTS: 1.0
SOLVER:
  CLIP_GRADIENTS:
    ENABLED: True
  WARMUP_FACTOR: 0.025
DATASETS:
  TRAIN: ("base_coco_2017_train", "densepose_coco_2014_train")
  TEST: ("densepose_chimps",)

@@ -0,0 +1,27 @@

_BASE_: "Base-RCNN-FPN-MC.yaml"
MODEL:
  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
  RESNETS:
    DEPTH: 101
  DENSEPOSE_ON: True
  ROI_HEADS:
    NAME: "DensePoseROIHeads"
    IN_FEATURES: ["p2", "p3", "p4", "p5"]
    NUM_CLASSES: 1
  ROI_DENSEPOSE_HEAD:
    NAME: "DensePoseV1ConvXHead"
    UV_CONFIDENCE:
      ENABLED: True
      TYPE: "iid_iso"
    POINT_REGRESSION_WEIGHTS: 0.0005
    POOLER_TYPE: "ROIAlign"
    NUM_COARSE_SEGM_CHANNELS: 2
    COARSE_SEGM_TRAINED_BY_MASKS: True
    INDEX_WEIGHTS: 1.0
SOLVER:
  CLIP_GRADIENTS:
    ENABLED: True
  WARMUP_FACTOR: 0.025
DATASETS:
  TRAIN: ("base_coco_2017_train", "densepose_coco_2014_train")
  TEST: ("densepose_chimps",)

@@ -0,0 +1,19 @@

_BASE_: "Base-RCNN-FPN-MC.yaml"
MODEL:
  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
  RESNETS:
    DEPTH: 50
  DENSEPOSE_ON: True
  ROI_HEADS:
    NAME: "DensePoseROIHeads"
    IN_FEATURES: ["p2", "p3", "p4", "p5"]
    NUM_CLASSES: 1
  ROI_DENSEPOSE_HEAD:
    NAME: "DensePoseV1ConvXHead"
    POOLER_TYPE: "ROIAlign"
    NUM_COARSE_SEGM_CHANNELS: 2
    COARSE_SEGM_TRAINED_BY_MASKS: True
    INDEX_WEIGHTS: 1.0
DATASETS:
  TRAIN: ("base_coco_2017_train", "densepose_coco_2014_train")
  TEST: ("densepose_chimps",)

@@ -0,0 +1,19 @@

_BASE_: "Base-RCNN-FPN-MC.yaml"
MODEL:
  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
  RESNETS:
    DEPTH: 50
  DENSEPOSE_ON: True
  ROI_HEADS:
    NAME: "DensePoseROIHeads"
    IN_FEATURES: ["p2", "p3", "p4", "p5"]
    NUM_CLASSES: 1
  ROI_DENSEPOSE_HEAD:
    NAME: "DensePoseDeepLabHead"
    POOLER_TYPE: "ROIAlign"
    NUM_COARSE_SEGM_CHANNELS: 2
    COARSE_SEGM_TRAINED_BY_MASKS: True
    INDEX_WEIGHTS: 1.0
DATASETS:
  TRAIN: ("base_coco_2017_train", "densepose_coco_2014_train")
  TEST: ("densepose_chimps",)

@@ -0,0 +1,29 @@

_BASE_: "Base-RCNN-FPN-MC.yaml"
MODEL:
  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
  RESNETS:
    DEPTH: 50
  DENSEPOSE_ON: True
  ROI_HEADS:
    NAME: "DensePoseROIHeads"
    IN_FEATURES: ["p2", "p3", "p4", "p5"]
    NUM_CLASSES: 1
  ROI_DENSEPOSE_HEAD:
    NAME: "DensePoseDeepLabHead"
    UV_CONFIDENCE:
      ENABLED: True
      TYPE: "iid_iso"
    SEGM_CONFIDENCE:
      ENABLED: True
    POINT_REGRESSION_WEIGHTS: 0.0005
    POOLER_TYPE: "ROIAlign"
    NUM_COARSE_SEGM_CHANNELS: 2
    COARSE_SEGM_TRAINED_BY_MASKS: True
    INDEX_WEIGHTS: 1.0
SOLVER:
  CLIP_GRADIENTS:
    ENABLED: True
  WARMUP_FACTOR: 0.025
DATASETS:
  TRAIN: ("base_coco_2017_train", "densepose_coco_2014_train")
  TEST: ("densepose_chimps",)

@@ -0,0 +1,27 @@

_BASE_: "Base-RCNN-FPN-MC.yaml"
MODEL:
  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
  RESNETS:
    DEPTH: 50
  DENSEPOSE_ON: True
  ROI_HEADS:
    NAME: "DensePoseROIHeads"
    IN_FEATURES: ["p2", "p3", "p4", "p5"]
    NUM_CLASSES: 1
  ROI_DENSEPOSE_HEAD:
    NAME: "DensePoseDeepLabHead"
    UV_CONFIDENCE:
      ENABLED: True
      TYPE: "iid_iso"
    POINT_REGRESSION_WEIGHTS: 0.0005
    POOLER_TYPE: "ROIAlign"
    NUM_COARSE_SEGM_CHANNELS: 2
    COARSE_SEGM_TRAINED_BY_MASKS: True
    INDEX_WEIGHTS: 1.0
SOLVER:
  CLIP_GRADIENTS:
    ENABLED: True
  WARMUP_FACTOR: 0.025
DATASETS:
  TRAIN: ("base_coco_2017_train", "densepose_coco_2014_train")
  TEST: ("densepose_chimps",)

@@ -0,0 +1,29 @@

_BASE_: "Base-RCNN-FPN-MC.yaml"
MODEL:
  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
  RESNETS:
    DEPTH: 50
  DENSEPOSE_ON: True
  ROI_HEADS:
    NAME: "DensePoseROIHeads"
    IN_FEATURES: ["p2", "p3", "p4", "p5"]
    NUM_CLASSES: 1
  ROI_DENSEPOSE_HEAD:
    NAME: "DensePoseV1ConvXHead"
    UV_CONFIDENCE:
      ENABLED: True
      TYPE: "iid_iso"
    SEGM_CONFIDENCE:
      ENABLED: True
    POINT_REGRESSION_WEIGHTS: 0.0005
    POOLER_TYPE: "ROIAlign"
    NUM_COARSE_SEGM_CHANNELS: 2
    COARSE_SEGM_TRAINED_BY_MASKS: True
    INDEX_WEIGHTS: 1.0
SOLVER:
  CLIP_GRADIENTS:
    ENABLED: True
  WARMUP_FACTOR: 0.025
DATASETS:
  TRAIN: ("base_coco_2017_train", "densepose_coco_2014_train")
  TEST: ("densepose_chimps",)

@@ -0,0 +1,30 @@

_BASE_: "Base-RCNN-FPN-MC-B.yaml"
MODEL:
  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
  RESNETS:
    DEPTH: 50
  DENSEPOSE_ON: True
  ROI_HEADS:
    NAME: "DensePoseROIHeads"
    IN_FEATURES: ["p2", "p3", "p4", "p5"]
    NUM_CLASSES: 1
  ROI_DENSEPOSE_HEAD:
    NAME: "DensePoseV1ConvXHead"
    UV_CONFIDENCE:
      ENABLED: True
      TYPE: "iid_iso"
    SEGM_CONFIDENCE:
      ENABLED: True
    POINT_REGRESSION_WEIGHTS: 0.0005
    POOLER_TYPE: "ROIAlign"
    NUM_COARSE_SEGM_CHANNELS: 2
    COARSE_SEGM_TRAINED_BY_MASKS: True
    INDEX_WEIGHTS: 1.0
SOLVER:
  CLIP_GRADIENTS:
    ENABLED: True
    CLIP_TYPE: "norm"
  WARMUP_FACTOR: 0.025
DATASETS:
  TRAIN: ("base_coco_2017_train", "densepose_coco_2014_train")
  TEST: ("densepose_chimps",)

@@ -0,0 +1,27 @@

_BASE_: "Base-RCNN-FPN-MC.yaml"
MODEL:
  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
  RESNETS:
    DEPTH: 50
  DENSEPOSE_ON: True
  ROI_HEADS:
    NAME: "DensePoseROIHeads"
    IN_FEATURES: ["p2", "p3", "p4", "p5"]
    NUM_CLASSES: 1
  ROI_DENSEPOSE_HEAD:
    NAME: "DensePoseV1ConvXHead"
    UV_CONFIDENCE:
      ENABLED: True
      TYPE: "iid_iso"
    POINT_REGRESSION_WEIGHTS: 0.0005
    POOLER_TYPE: "ROIAlign"
    NUM_COARSE_SEGM_CHANNELS: 2
    COARSE_SEGM_TRAINED_BY_MASKS: True
    INDEX_WEIGHTS: 1.0
SOLVER:
  CLIP_GRADIENTS:
    ENABLED: True
  WARMUP_FACTOR: 0.025
DATASETS:
  TRAIN: ("base_coco_2017_train", "densepose_coco_2014_train")
  TEST: ("densepose_chimps",)

@@ -0,0 +1,7 @@

_BASE_: "Base-RCNN-FPN-MC.yaml"
MODEL:
  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
  MASK_ON: False
  DENSEPOSE_ON: False
  RESNETS:
    DEPTH: 50
@@ -0,0 +1,7 @@

_BASE_: "../HRNet/densepose_rcnn_HRFPN_HRNet_w32_s1x.yaml"
DATASETS:
  TRAIN: ("densepose_coco_2014_minival_100",)
  TEST: ("densepose_coco_2014_minival_100",)
SOLVER:
  MAX_ITER: 40
  STEPS: (30,)

@@ -0,0 +1,11 @@

_BASE_: "../Base-DensePose-RCNN-FPN.yaml"
MODEL:
  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
  ROI_DENSEPOSE_HEAD:
    NAME: "DensePoseDeepLabHead"
DATASETS:
  TRAIN: ("densepose_coco_2014_minival_100",)
  TEST: ("densepose_coco_2014_minival_100",)
SOLVER:
  MAX_ITER: 40
  STEPS: (30,)

@@ -0,0 +1,13 @@

_BASE_: "../densepose_rcnn_R_50_FPN_s1x.yaml"
MODEL:
  WEIGHTS: "https://dl.fbaipublicfiles.com/densepose/densepose_rcnn_R_50_FPN_s1x/165712039/model_final_162be9.pkl"
DATASETS:
  TRAIN: ()
  TEST: ("densepose_coco_2014_minival_100",)
TEST:
  AUG:
    ENABLED: True
    MIN_SIZES: (400, 500, 600, 700, 800, 900, 1000, 1100, 1200)
    MAX_SIZE: 4000
    FLIP: True
  EXPECTED_RESULTS: [["bbox_TTA", "AP", 61.74, 0.03], ["densepose_gps_TTA", "AP", 60.22, 0.03], ["densepose_gpsm_TTA", "AP", 63.85, 0.03]]

@@ -0,0 +1,19 @@

_BASE_: "../Base-DensePose-RCNN-FPN.yaml"
MODEL:
  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
  RESNETS:
    DEPTH: 50
  ROI_DENSEPOSE_HEAD:
    UV_CONFIDENCE:
      ENABLED: True
      TYPE: "iid_iso"
    POINT_REGRESSION_WEIGHTS: 0.0005
DATASETS:
  TRAIN: ("densepose_coco_2014_minival_100",)
  TEST: ("densepose_coco_2014_minival_100",)
SOLVER:
  CLIP_GRADIENTS:
    ENABLED: True
  MAX_ITER: 40
  STEPS: (30,)
  WARMUP_FACTOR: 0.025

@@ -0,0 +1,19 @@

_BASE_: "../Base-DensePose-RCNN-FPN.yaml"
MODEL:
  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
  RESNETS:
    DEPTH: 50
  ROI_DENSEPOSE_HEAD:
    UV_CONFIDENCE:
      ENABLED: True
      TYPE: "indep_aniso"
    POINT_REGRESSION_WEIGHTS: 0.0005
DATASETS:
  TRAIN: ("densepose_coco_2014_minival_100",)
  TEST: ("densepose_coco_2014_minival_100",)
SOLVER:
  CLIP_GRADIENTS:
    ENABLED: True
  MAX_ITER: 40
  STEPS: (30,)
  WARMUP_FACTOR: 0.025

@@ -0,0 +1,8 @@

_BASE_: "../densepose_rcnn_R_50_FPN_s1x.yaml"
MODEL:
  WEIGHTS: "https://dl.fbaipublicfiles.com/densepose/densepose_rcnn_R_50_FPN_s1x/165712039/model_final_162be9.pkl"
DATASETS:
  TRAIN: ()
  TEST: ("densepose_coco_2014_minival_100",)
TEST:
  EXPECTED_RESULTS: [["bbox", "AP", 59.27, 0.025], ["densepose_gps", "AP", 60.11, 0.02], ["densepose_gpsm", "AP", 64.20, 0.02]]

@@ -0,0 +1,9 @@

_BASE_: "../Base-DensePose-RCNN-FPN.yaml"
MODEL:
  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
DATASETS:
  TRAIN: ("densepose_coco_2014_minival_100",)
  TEST: ("densepose_coco_2014_minival_100",)
SOLVER:
  MAX_ITER: 40
  STEPS: (30,)

@@ -0,0 +1,18 @@

_BASE_: "../Base-DensePose-RCNN-FPN.yaml"
MODEL:
  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
  ROI_HEADS:
    NUM_CLASSES: 1
DATASETS:
  TRAIN: ("densepose_coco_2014_minival",)
  TEST: ("densepose_coco_2014_minival",)
SOLVER:
  CLIP_GRADIENTS:
    ENABLED: True
    CLIP_TYPE: norm
    CLIP_VALUE: 1.0
  MAX_ITER: 6000
  STEPS: (5500, 5800)
TEST:
  EXPECTED_RESULTS: [["bbox", "AP", 76.2477, 1.0], ["densepose_gps", "AP", 79.6090, 1.5], ["densepose_gpsm", "AP", 80.0061, 1.5]]
@@ -0,0 +1,171 @@

# -*- coding: utf-8 -*-
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved

from detectron2.config import CfgNode as CN


def add_dataset_category_config(cfg: CN):
    """
    Add config for additional category-related dataset options
    - category whitelisting
    - category mapping
    """
    _C = cfg
    _C.DATASETS.CATEGORY_MAPS = CN(new_allowed=True)
    _C.DATASETS.WHITELISTED_CATEGORIES = CN(new_allowed=True)


def add_bootstrap_config(cfg: CN):
    """
    Add config for bootstrap datasets and the model used to annotate them
    """
    _C = cfg
    _C.BOOTSTRAP_DATASETS = []
    _C.BOOTSTRAP_MODEL = CN()
    _C.BOOTSTRAP_MODEL.WEIGHTS = ""
    _C.BOOTSTRAP_MODEL.DEVICE = "cuda"


def get_bootstrap_dataset_config() -> CN:
    _C = CN()
    _C.DATASET = ""
    # ratio used to mix data loaders
    _C.RATIO = 0.1
    # image loader
    _C.IMAGE_LOADER = CN(new_allowed=True)
    _C.IMAGE_LOADER.TYPE = ""
    _C.IMAGE_LOADER.BATCH_SIZE = 4
    _C.IMAGE_LOADER.NUM_WORKERS = 4
    # inference
    _C.INFERENCE = CN()
    # batch size for model inputs
    _C.INFERENCE.INPUT_BATCH_SIZE = 4
    # batch size to group model outputs
    _C.INFERENCE.OUTPUT_BATCH_SIZE = 2
    # sampled data
    _C.DATA_SAMPLER = CN(new_allowed=True)
    _C.DATA_SAMPLER.TYPE = ""
    # filter
    _C.FILTER = CN(new_allowed=True)
    _C.FILTER.TYPE = ""
    return _C


def load_bootstrap_config(cfg: CN):
    """
    Bootstrap datasets are given as a list of `dict` that are not automatically
    converted into CfgNode. This method processes all bootstrap dataset entries
    and ensures that they are in CfgNode format and comply with the specification
    """
    if not cfg.BOOTSTRAP_DATASETS:
        return

    bootstrap_datasets_cfgnodes = []
    for dataset_cfg in cfg.BOOTSTRAP_DATASETS:
        _C = get_bootstrap_dataset_config().clone()
        _C.merge_from_other_cfg(CN(dataset_cfg))
        bootstrap_datasets_cfgnodes.append(_C)
    cfg.BOOTSTRAP_DATASETS = bootstrap_datasets_cfgnodes


def add_densepose_head_config(cfg: CN):
    """
    Add config for densepose head.
    """
    _C = cfg

    _C.MODEL.DENSEPOSE_ON = True

    _C.MODEL.ROI_DENSEPOSE_HEAD = CN()
    _C.MODEL.ROI_DENSEPOSE_HEAD.NAME = ""
    _C.MODEL.ROI_DENSEPOSE_HEAD.NUM_STACKED_CONVS = 8
    # Number of parts used for point labels
    _C.MODEL.ROI_DENSEPOSE_HEAD.NUM_PATCHES = 24
    _C.MODEL.ROI_DENSEPOSE_HEAD.DECONV_KERNEL = 4
    _C.MODEL.ROI_DENSEPOSE_HEAD.CONV_HEAD_DIM = 512
    _C.MODEL.ROI_DENSEPOSE_HEAD.CONV_HEAD_KERNEL = 3
    _C.MODEL.ROI_DENSEPOSE_HEAD.UP_SCALE = 2
    _C.MODEL.ROI_DENSEPOSE_HEAD.HEATMAP_SIZE = 112
    _C.MODEL.ROI_DENSEPOSE_HEAD.POOLER_TYPE = "ROIAlignV2"
    _C.MODEL.ROI_DENSEPOSE_HEAD.POOLER_RESOLUTION = 28
    _C.MODEL.ROI_DENSEPOSE_HEAD.POOLER_SAMPLING_RATIO = 2
    _C.MODEL.ROI_DENSEPOSE_HEAD.NUM_COARSE_SEGM_CHANNELS = 2  # 15 or 2
    # Overlap threshold for an RoI to be considered foreground (if >= FG_IOU_THRESHOLD)
    _C.MODEL.ROI_DENSEPOSE_HEAD.FG_IOU_THRESHOLD = 0.7
    # Loss weights for annotation masks (14 parts)
    _C.MODEL.ROI_DENSEPOSE_HEAD.INDEX_WEIGHTS = 5.0
    # Loss weights for surface parts (24 parts)
    _C.MODEL.ROI_DENSEPOSE_HEAD.PART_WEIGHTS = 1.0
    # Loss weights for UV regression.
    _C.MODEL.ROI_DENSEPOSE_HEAD.POINT_REGRESSION_WEIGHTS = 0.01
    # Coarse segmentation is trained using instance segmentation task data
    _C.MODEL.ROI_DENSEPOSE_HEAD.COARSE_SEGM_TRAINED_BY_MASKS = False
    # For Decoder
    _C.MODEL.ROI_DENSEPOSE_HEAD.DECODER_ON = True
    _C.MODEL.ROI_DENSEPOSE_HEAD.DECODER_NUM_CLASSES = 256
    _C.MODEL.ROI_DENSEPOSE_HEAD.DECODER_CONV_DIMS = 256
    _C.MODEL.ROI_DENSEPOSE_HEAD.DECODER_NORM = ""
    _C.MODEL.ROI_DENSEPOSE_HEAD.DECODER_COMMON_STRIDE = 4
    # For DeepLab head
    _C.MODEL.ROI_DENSEPOSE_HEAD.DEEPLAB = CN()
    _C.MODEL.ROI_DENSEPOSE_HEAD.DEEPLAB.NORM = "GN"
    _C.MODEL.ROI_DENSEPOSE_HEAD.DEEPLAB.NONLOCAL_ON = 0
    # Confidences
    # Enable learning UV confidences (variances) along with the actual values
    _C.MODEL.ROI_DENSEPOSE_HEAD.UV_CONFIDENCE = CN({"ENABLED": False})
    # UV confidence lower bound
    _C.MODEL.ROI_DENSEPOSE_HEAD.UV_CONFIDENCE.EPSILON = 0.01
    # Enable learning segmentation confidences (variances) along with the actual values
    _C.MODEL.ROI_DENSEPOSE_HEAD.SEGM_CONFIDENCE = CN({"ENABLED": False})
    # Segmentation confidence lower bound
    _C.MODEL.ROI_DENSEPOSE_HEAD.SEGM_CONFIDENCE.EPSILON = 0.01
    # Statistical model type for confidence learning, possible values:
    # - "iid_iso": statistically independent identically distributed residuals
    #   with isotropic covariance
    # - "indep_aniso": statistically independent residuals with anisotropic
    #   covariances
    _C.MODEL.ROI_DENSEPOSE_HEAD.UV_CONFIDENCE.TYPE = "iid_iso"
    # List of angles for rotation in data augmentation during training
    _C.INPUT.ROTATION_ANGLES = [0]
    _C.TEST.AUG.ROTATION_ANGLES = ()  # Rotation TTA


def add_hrnet_config(cfg: CN):
    """
    Add config for HRNet backbone.
    """
    _C = cfg

    # For HigherHRNet w32
    _C.MODEL.HRNET = CN()
    _C.MODEL.HRNET.STEM_INPLANES = 64
    _C.MODEL.HRNET.STAGE2 = CN()
    _C.MODEL.HRNET.STAGE2.NUM_MODULES = 1
    _C.MODEL.HRNET.STAGE2.NUM_BRANCHES = 2
    _C.MODEL.HRNET.STAGE2.BLOCK = "BASIC"
    _C.MODEL.HRNET.STAGE2.NUM_BLOCKS = [4, 4]
    _C.MODEL.HRNET.STAGE2.NUM_CHANNELS = [32, 64]
    _C.MODEL.HRNET.STAGE2.FUSE_METHOD = "SUM"
    _C.MODEL.HRNET.STAGE3 = CN()
    _C.MODEL.HRNET.STAGE3.NUM_MODULES = 4
    _C.MODEL.HRNET.STAGE3.NUM_BRANCHES = 3
    _C.MODEL.HRNET.STAGE3.BLOCK = "BASIC"
    _C.MODEL.HRNET.STAGE3.NUM_BLOCKS = [4, 4, 4]
    _C.MODEL.HRNET.STAGE3.NUM_CHANNELS = [32, 64, 128]
    _C.MODEL.HRNET.STAGE3.FUSE_METHOD = "SUM"
    _C.MODEL.HRNET.STAGE4 = CN()
    _C.MODEL.HRNET.STAGE4.NUM_MODULES = 3
    _C.MODEL.HRNET.STAGE4.NUM_BRANCHES = 4
    _C.MODEL.HRNET.STAGE4.BLOCK = "BASIC"
    _C.MODEL.HRNET.STAGE4.NUM_BLOCKS = [4, 4, 4, 4]
    _C.MODEL.HRNET.STAGE4.NUM_CHANNELS = [32, 64, 128, 256]
    _C.MODEL.HRNET.STAGE4.FUSE_METHOD = "SUM"

    _C.MODEL.HRNET.HRFPN = CN()
    _C.MODEL.HRNET.HRFPN.OUT_CHANNELS = 256


def add_densepose_config(cfg: CN):
    add_densepose_head_config(cfg)
    add_hrnet_config(cfg)
    add_bootstrap_config(cfg)
    add_dataset_category_config(cfg)
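A minimal usage sketch for the bootstrap helpers above (the dataset name comes from the MC-B config earlier in this diff; everything else falls back to the defaults from `get_bootstrap_dataset_config`):

```python
from detectron2.config import get_cfg

from densepose.config import add_densepose_config, load_bootstrap_config

cfg = get_cfg()
add_densepose_config(cfg)  # also adds BOOTSTRAP_DATASETS / BOOTSTRAP_MODEL defaults
# Entries arrive as plain dicts (e.g. parsed from YAML) ...
cfg.BOOTSTRAP_DATASETS = [{"DATASET": "chimpnsee", "RATIO": 1.0}]
# ... and are normalized to CfgNodes merged over the per-dataset spec.
load_bootstrap_config(cfg)
print(cfg.BOOTSTRAP_DATASETS[0].IMAGE_LOADER.BATCH_SIZE)  # 4 (default from the spec)
```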
@@ -0,0 +1,23 @@

# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved

from .build import (
    build_detection_test_loader,
    build_detection_train_loader,
    build_combined_loader,
    build_frame_selector,
    build_inference_based_loaders,
    has_inference_based_loaders,
    BootstrapDatasetFactoryCatalog,
)
from .combined_loader import CombinedDataLoader
from .dataset_mapper import DatasetMapper
from .inference_based_loader import InferenceBasedLoader, ScoreBasedFilter
from .utils import is_relative_local_path, maybe_prepend_base_path

# ensure the builtin datasets are registered
from . import datasets

# ensure the bootstrap datasets builders are registered
from . import build

__all__ = [k for k in globals().keys() if not k.startswith("_")]
@@ -0,0 +1,604 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved

import itertools
import logging
import numpy as np
from collections import UserDict
from typing import Any, Callable, Collection, Dict, Iterable, List, Optional, Sequence, Tuple
import torch
from torch.utils.data.dataset import Dataset

from detectron2.config import CfgNode
from detectron2.data.build import (
    build_batch_data_loader,
    load_proposals_into_dataset,
    print_instances_class_histogram,
    trivial_batch_collator,
)
from detectron2.data.catalog import DatasetCatalog, Metadata, MetadataCatalog
from detectron2.data.common import DatasetFromList, MapDataset
from detectron2.data.samplers import InferenceSampler, RepeatFactorTrainingSampler, TrainingSampler
from detectron2.utils.comm import get_world_size

from densepose.config import get_bootstrap_dataset_config

from .combined_loader import CombinedDataLoader, Loader
from .dataset_mapper import DatasetMapper
from .datasets.coco import DENSEPOSE_KEYS_WITHOUT_MASK as DENSEPOSE_COCO_KEYS_WITHOUT_MASK
from .datasets.coco import DENSEPOSE_MASK_KEY as DENSEPOSE_COCO_MASK_KEY
from .datasets.dataset_type import DatasetType
from .inference_based_loader import InferenceBasedLoader, ScoreBasedFilter
from .samplers import (
    DensePoseConfidenceBasedSampler,
    DensePoseUniformSampler,
    MaskFromDensePoseSampler,
    PredictionToGroundTruthSampler,
)
from .transform import ImageResizeTransform
from .video import (
    FirstKFramesSelector,
    FrameSelectionStrategy,
    LastKFramesSelector,
    RandomKFramesSelector,
    VideoKeyframeDataset,
    video_list_from_file,
)

__all__ = ["build_detection_train_loader", "build_detection_test_loader"]


Instance = Dict[str, Any]
InstancePredicate = Callable[[Instance], bool]

def _compute_num_images_per_worker(cfg: CfgNode):
    num_workers = get_world_size()
    images_per_batch = cfg.SOLVER.IMS_PER_BATCH
    assert (
        images_per_batch % num_workers == 0
    ), "SOLVER.IMS_PER_BATCH ({}) must be divisible by the number of workers ({}).".format(
        images_per_batch, num_workers
    )
    assert (
        images_per_batch >= num_workers
    ), "SOLVER.IMS_PER_BATCH ({}) must be larger than the number of workers ({}).".format(
        images_per_batch, num_workers
    )
    images_per_worker = images_per_batch // num_workers
    return images_per_worker


def _map_category_id_to_contiguous_id(dataset_name: str, dataset_dicts: Iterable[Instance]):
    meta = MetadataCatalog.get(dataset_name)
    for dataset_dict in dataset_dicts:
        for ann in dataset_dict["annotations"]:
            ann["category_id"] = meta.thing_dataset_id_to_contiguous_id[ann["category_id"]]


def _add_category_id_to_contiguous_id_maps_to_metadata(dataset_names: Iterable[str]):
    # merge categories for all datasets
    merged_categories = {}
    for dataset_name in dataset_names:
        meta = MetadataCatalog.get(dataset_name)
        for cat_id, cat_name in meta.categories.items():
            if cat_id not in merged_categories:
                merged_categories[cat_id] = (cat_name, dataset_name)
                continue
            cat_name_other, dataset_name_other = merged_categories[cat_id]
            if cat_name_other != cat_name:
                raise ValueError(
                    f"Incompatible categories for category ID {cat_id}: "
                    f'dataset {dataset_name} value "{cat_name}", '
                    f'dataset {dataset_name_other} value "{cat_name_other}"'
                )

    merged_cat_id_to_cont_id = {}
    for i, cat_id in enumerate(sorted(merged_categories.keys())):
        merged_cat_id_to_cont_id[cat_id] = i

    # add category maps to metadata
    for dataset_name in dataset_names:
        meta = MetadataCatalog.get(dataset_name)
        categories = meta.get("categories")
        meta.thing_classes = [categories[cat_id] for cat_id in sorted(categories.keys())]
        meta.thing_dataset_id_to_contiguous_id = {
            cat_id: merged_cat_id_to_cont_id[cat_id] for cat_id in sorted(categories.keys())
        }
        meta.thing_contiguous_id_to_dataset_id = {
            merged_cat_id_to_cont_id[cat_id]: cat_id for cat_id in sorted(categories.keys())
        }

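
# For illustration only: what the merging above produces for two hypothetical
# datasets that share category IDs (the names "toy_a" / "toy_b" are made up).
#
#   MetadataCatalog.get("toy_a").categories = {1: "person", 3: "dog"}
#   MetadataCatalog.get("toy_b").categories = {1: "person", 2: "cat"}
#   _add_category_id_to_contiguous_id_maps_to_metadata(["toy_a", "toy_b"])
#   # merged IDs {1, 2, 3} map to contiguous IDs {0, 1, 2}, so:
#   MetadataCatalog.get("toy_a").thing_dataset_id_to_contiguous_id  # {1: 0, 3: 2}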
def _maybe_create_general_keep_instance_predicate(cfg: CfgNode) -> Optional[InstancePredicate]:
    def has_annotations(instance: Instance) -> bool:
        return "annotations" in instance

    def has_only_crowd_annotations(instance: Instance) -> bool:
        for ann in instance["annotations"]:
            # the standard COCO key is "iscrowd" (see the annotation loading below)
            if ann.get("iscrowd", 0) == 0:
                return False
        return True

    def general_keep_instance_predicate(instance: Instance) -> bool:
        return has_annotations(instance) and not has_only_crowd_annotations(instance)

    if not cfg.DATALOADER.FILTER_EMPTY_ANNOTATIONS:
        return None
    return general_keep_instance_predicate

def _maybe_create_keypoints_keep_instance_predicate(cfg: CfgNode) -> Optional[InstancePredicate]:

    min_num_keypoints = cfg.MODEL.ROI_KEYPOINT_HEAD.MIN_KEYPOINTS_PER_IMAGE

    def has_sufficient_num_keypoints(instance: Instance) -> bool:
        num_kpts = sum(
            (np.array(ann["keypoints"][2::3]) > 0).sum()
            for ann in instance["annotations"]
            if "keypoints" in ann
        )
        return num_kpts >= min_num_keypoints

    if cfg.MODEL.KEYPOINT_ON and (min_num_keypoints > 0):
        return has_sufficient_num_keypoints
    return None


def _maybe_create_mask_keep_instance_predicate(cfg: CfgNode) -> Optional[InstancePredicate]:
    if not cfg.MODEL.MASK_ON:
        return None

    def has_mask_annotations(instance: Instance) -> bool:
        return any("segmentation" in ann for ann in instance["annotations"])

    return has_mask_annotations


def _maybe_create_densepose_keep_instance_predicate(cfg: CfgNode) -> Optional[InstancePredicate]:
    if not cfg.MODEL.DENSEPOSE_ON:
        return None

    use_masks = cfg.MODEL.ROI_DENSEPOSE_HEAD.COARSE_SEGM_TRAINED_BY_MASKS

    def has_densepose_annotations(instance: Instance) -> bool:
        for ann in instance["annotations"]:
            if all(key in ann for key in DENSEPOSE_COCO_KEYS_WITHOUT_MASK) and (
                (DENSEPOSE_COCO_MASK_KEY in ann) or ("segmentation" in ann)
            ):
                return True
            if use_masks and "segmentation" in ann:
                return True
        return False

    return has_densepose_annotations

def _maybe_create_specific_keep_instance_predicate(cfg: CfgNode) -> Optional[InstancePredicate]:
    specific_predicate_creators = [
        _maybe_create_keypoints_keep_instance_predicate,
        _maybe_create_mask_keep_instance_predicate,
        _maybe_create_densepose_keep_instance_predicate,
    ]
    predicates = [creator(cfg) for creator in specific_predicate_creators]
    predicates = [p for p in predicates if p is not None]
    if not predicates:
        return None

    def combined_predicate(instance: Instance) -> bool:
        return any(p(instance) for p in predicates)

    return combined_predicate


def _get_train_keep_instance_predicate(cfg: CfgNode):
    general_keep_predicate = _maybe_create_general_keep_instance_predicate(cfg)
    combined_specific_keep_predicate = _maybe_create_specific_keep_instance_predicate(cfg)

    def combined_general_specific_keep_predicate(instance: Instance) -> bool:
        return general_keep_predicate(instance) and combined_specific_keep_predicate(instance)

    if (general_keep_predicate is None) and (combined_specific_keep_predicate is None):
        return None
    if general_keep_predicate is None:
        return combined_specific_keep_predicate
    if combined_specific_keep_predicate is None:
        return general_keep_predicate
    return combined_general_specific_keep_predicate


def _get_test_keep_instance_predicate(cfg: CfgNode):
    general_keep_predicate = _maybe_create_general_keep_instance_predicate(cfg)
    return general_keep_predicate

def _maybe_filter_and_map_categories(
    dataset_name: str, dataset_dicts: List[Instance]
) -> List[Instance]:
    meta = MetadataCatalog.get(dataset_name)
    whitelisted_categories = meta.get("whitelisted_categories")
    category_map = meta.get("category_map", {})
    if whitelisted_categories is None and not category_map:
        return dataset_dicts
    filtered_dataset_dicts = []
    for dataset_dict in dataset_dicts:
        anns = []
        for ann in dataset_dict["annotations"]:
            cat_id = ann["category_id"]
            if whitelisted_categories is not None and cat_id not in whitelisted_categories:
                continue
            ann["category_id"] = category_map.get(cat_id, cat_id)
            anns.append(ann)
        dataset_dict["annotations"] = anns
        filtered_dataset_dicts.append(dataset_dict)
    return filtered_dataset_dicts


def _add_category_whitelists_to_metadata(cfg: CfgNode):
    for dataset_name, whitelisted_cat_ids in cfg.DATASETS.WHITELISTED_CATEGORIES.items():
        meta = MetadataCatalog.get(dataset_name)
        meta.whitelisted_categories = whitelisted_cat_ids
        logger = logging.getLogger(__name__)
        logger.info(
            "Whitelisted categories for dataset {}: {}".format(
                dataset_name, meta.whitelisted_categories
            )
        )


def _add_category_maps_to_metadata(cfg: CfgNode):
    for dataset_name, category_map in cfg.DATASETS.CATEGORY_MAPS.items():
        category_map = {
            int(cat_id_src): int(cat_id_dst) for cat_id_src, cat_id_dst in category_map.items()
        }
        meta = MetadataCatalog.get(dataset_name)
        meta.category_map = category_map
        logger = logging.getLogger(__name__)
        logger.info("Category maps for dataset {}: {}".format(dataset_name, meta.category_map))

def combine_detection_dataset_dicts(
    dataset_names: Collection[str],
    keep_instance_predicate: Optional[InstancePredicate] = None,
    proposal_files: Optional[Collection[str]] = None,
) -> List[Instance]:
    """
    Load and prepare dataset dicts for training / testing

    Args:
        dataset_names (Collection[str]): a list of dataset names
        keep_instance_predicate (Callable: Dict[str, Any] -> bool): predicate
            applied to instance dicts which defines whether to keep the instance
        proposal_files (Collection[str]): if given, a list of object proposal files
            that match each dataset in `dataset_names`.
    """
    assert len(dataset_names)
    if proposal_files is None:
        proposal_files = [None] * len(dataset_names)
    assert len(dataset_names) == len(proposal_files)
    # load annotations and dataset metadata
    dataset_map = {}
    for dataset_name in dataset_names:
        dataset_dicts = DatasetCatalog.get(dataset_name)
        dataset_map[dataset_name] = dataset_dicts
    # initialize category maps
    _add_category_id_to_contiguous_id_maps_to_metadata(dataset_names)
    # apply category maps
    all_datasets_dicts = []
    for dataset_name, proposal_file in zip(dataset_names, proposal_files):
        dataset_dicts = dataset_map[dataset_name]
        assert len(dataset_dicts), f"Dataset '{dataset_name}' is empty!"
        if proposal_file is not None:
            dataset_dicts = load_proposals_into_dataset(dataset_dicts, proposal_file)
        dataset_dicts = _maybe_filter_and_map_categories(dataset_name, dataset_dicts)
        _map_category_id_to_contiguous_id(dataset_name, dataset_dicts)
        print_instances_class_histogram(
            dataset_dicts, MetadataCatalog.get(dataset_name).thing_classes
        )
        all_datasets_dicts.append(dataset_dicts)

    if keep_instance_predicate is not None:
        all_datasets_dicts_plain = [
            d
            for d in itertools.chain.from_iterable(all_datasets_dicts)
            if keep_instance_predicate(d)
        ]
    else:
        all_datasets_dicts_plain = list(itertools.chain.from_iterable(all_datasets_dicts))
    return all_datasets_dicts_plain

def build_detection_train_loader(cfg: CfgNode, mapper=None):
    """
    A data loader is created in a way similar to that of Detectron2.
    The main difference is that it allows combining datasets with different
    but compatible object category sets.

    The data loader is created by the following steps:
    1. Use the dataset names in config to query :class:`DatasetCatalog`, and obtain a list of dicts.
    2. Start workers to work on the dicts. Each worker will:
        * Map each metadata dict into another format to be consumed by the model.
        * Batch them by simply putting dicts into a list.
    The batched ``list[mapped_dict]`` is what this dataloader will return.

    Args:
        cfg (CfgNode): the config
        mapper (callable): a callable which takes a sample (dict) from dataset and
            returns the format to be consumed by the model.
            By default it will be `DatasetMapper(cfg, True)`.

    Returns:
        an infinite iterator of training data
    """

    _add_category_whitelists_to_metadata(cfg)
    _add_category_maps_to_metadata(cfg)
    dataset_dicts = combine_detection_dataset_dicts(
        cfg.DATASETS.TRAIN,
        keep_instance_predicate=_get_train_keep_instance_predicate(cfg),
        proposal_files=cfg.DATASETS.PROPOSAL_FILES_TRAIN if cfg.MODEL.LOAD_PROPOSALS else None,
    )
    dataset = DatasetFromList(dataset_dicts, copy=False)

    if mapper is None:
        mapper = DatasetMapper(cfg, True)
    dataset = MapDataset(dataset, mapper)

    sampler_name = cfg.DATALOADER.SAMPLER_TRAIN
    logger = logging.getLogger(__name__)
    logger.info("Using training sampler {}".format(sampler_name))
    if sampler_name == "TrainingSampler":
        sampler = TrainingSampler(len(dataset))
    elif sampler_name == "RepeatFactorTrainingSampler":
        repeat_factors = RepeatFactorTrainingSampler.repeat_factors_from_category_frequency(
            dataset_dicts, cfg.DATALOADER.REPEAT_THRESHOLD
        )
        sampler = RepeatFactorTrainingSampler(repeat_factors)
    else:
        raise ValueError("Unknown training sampler: {}".format(sampler_name))

    return build_batch_data_loader(
        dataset,
        sampler,
        cfg.SOLVER.IMS_PER_BATCH,
        aspect_ratio_grouping=cfg.DATALOADER.ASPECT_RATIO_GROUPING,
        num_workers=cfg.DATALOADER.NUM_WORKERS,
    )

def build_detection_test_loader(cfg, dataset_name, mapper=None):
    """
    Similar to `build_detection_train_loader`.
    But this function uses the given `dataset_name` argument (instead of the names in cfg),
    and uses batch size 1.

    Args:
        cfg: a detectron2 CfgNode
        dataset_name (str): a name of the dataset that's available in the DatasetCatalog
        mapper (callable): a callable which takes a sample (dict) from dataset
            and returns the format to be consumed by the model.
            By default it will be `DatasetMapper(cfg, False)`.

    Returns:
        DataLoader: a torch DataLoader, that loads the given detection
        dataset, with test-time transformation and batching.
    """
    _add_category_whitelists_to_metadata(cfg)
    _add_category_maps_to_metadata(cfg)
    dataset_dicts = combine_detection_dataset_dicts(
        [dataset_name],
        keep_instance_predicate=_get_test_keep_instance_predicate(cfg),
        proposal_files=[
            cfg.DATASETS.PROPOSAL_FILES_TEST[list(cfg.DATASETS.TEST).index(dataset_name)]
        ]
        if cfg.MODEL.LOAD_PROPOSALS
        else None,
    )

    dataset = DatasetFromList(dataset_dicts)
    if mapper is None:
        mapper = DatasetMapper(cfg, False)
    dataset = MapDataset(dataset, mapper)

    sampler = InferenceSampler(len(dataset))
    # Always use 1 image per worker during inference since this is the
    # standard when reporting inference time in papers.
    batch_sampler = torch.utils.data.sampler.BatchSampler(sampler, 1, drop_last=False)

    data_loader = torch.utils.data.DataLoader(
        dataset,
        num_workers=cfg.DATALOADER.NUM_WORKERS,
        batch_sampler=batch_sampler,
        collate_fn=trivial_batch_collator,
    )
    return data_loader

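
# For illustration only: building both loaders from a config (assuming `cfg`
# has been extended with add_densepose_config and points at registered
# datasets).
#
#   train_loader = build_detection_train_loader(cfg)  # infinite iterator
#   test_loader = build_detection_test_loader(cfg, cfg.DATASETS.TEST[0])
#   batch = next(iter(train_loader))                  # list[mapped_dict]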
def build_frame_selector(cfg: CfgNode):
    strategy = FrameSelectionStrategy(cfg.STRATEGY)
    if strategy == FrameSelectionStrategy.RANDOM_K:
        frame_selector = RandomKFramesSelector(cfg.NUM_IMAGES)
    elif strategy == FrameSelectionStrategy.FIRST_K:
        frame_selector = FirstKFramesSelector(cfg.NUM_IMAGES)
    elif strategy == FrameSelectionStrategy.LAST_K:
        frame_selector = LastKFramesSelector(cfg.NUM_IMAGES)
    elif strategy == FrameSelectionStrategy.ALL:
        frame_selector = None
    return frame_selector


def build_transform(cfg: CfgNode, data_type: str):
    if cfg.TYPE == "resize":
        if data_type == "image":
            return ImageResizeTransform(cfg.MIN_SIZE, cfg.MAX_SIZE)
    raise ValueError(f"Unknown transform {cfg.TYPE} for data type {data_type}")

def build_combined_loader(cfg: CfgNode, loaders: Collection[Loader], ratios: Sequence[float]):
    images_per_worker = _compute_num_images_per_worker(cfg)
    return CombinedDataLoader(loaders, images_per_worker, ratios)


def build_bootstrap_dataset(dataset_name: str, cfg: CfgNode) -> Sequence[torch.Tensor]:
    """
    Build dataset that provides data to bootstrap on

    Args:
        dataset_name (str): Name of the dataset, needs to have associated metadata
            to load the data
        cfg (CfgNode): bootstrapping config
    Returns:
        Sequence[Tensor] - dataset that provides image batches, Tensors of size
        [N, C, H, W] of type float32
    """
    logger = logging.getLogger(__name__)
    meta = MetadataCatalog.get(dataset_name)
    factory = BootstrapDatasetFactoryCatalog.get(meta.dataset_type)
    dataset = None
    if factory is not None:
        dataset = factory(meta, cfg)
    if dataset is None:
        logger.warning(f"Failed to create dataset {dataset_name} of type {meta.dataset_type}")
    return dataset

def build_data_sampler(cfg: CfgNode):
    if cfg.TYPE == "densepose_uniform":
        data_sampler = PredictionToGroundTruthSampler()
        # transform densepose pred -> gt
        data_sampler.register_sampler(
            "pred_densepose",
            "gt_densepose",
            DensePoseUniformSampler(count_per_class=cfg.COUNT_PER_CLASS),
        )
        data_sampler.register_sampler("pred_densepose", "gt_masks", MaskFromDensePoseSampler())
        return data_sampler
    # the confidence-based sampler types differ only in the confidence channel used
    confidence_channels = {
        "densepose_UV_confidence": "sigma_2",
        "densepose_fine_segm_confidence": "fine_segm_confidence",
        "densepose_coarse_segm_confidence": "coarse_segm_confidence",
    }
    if cfg.TYPE in confidence_channels:
        data_sampler = PredictionToGroundTruthSampler()
        # transform densepose pred -> gt
        data_sampler.register_sampler(
            "pred_densepose",
            "gt_densepose",
            DensePoseConfidenceBasedSampler(
                confidence_channel=confidence_channels[cfg.TYPE],
                count_per_class=cfg.COUNT_PER_CLASS,
                search_proportion=0.5,
            ),
        )
        data_sampler.register_sampler("pred_densepose", "gt_masks", MaskFromDensePoseSampler())
        return data_sampler

    raise ValueError(f"Unknown data sampler type {cfg.TYPE}")


def build_data_filter(cfg: CfgNode):
    if cfg.TYPE == "detection_score":
        min_score = cfg.MIN_VALUE
        return ScoreBasedFilter(min_score=min_score)
    raise ValueError(f"Unknown data filter type {cfg.TYPE}")

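
# For illustration only: a hypothetical sampler config carrying the keys this
# builder reads.
#
#   from detectron2.config import CfgNode as CN
#
#   sampler_cfg = CN()
#   sampler_cfg.TYPE = "densepose_uniform"
#   sampler_cfg.COUNT_PER_CLASS = 8
#   sampler = build_data_sampler(sampler_cfg)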
def build_inference_based_loader(
    cfg: CfgNode, dataset_cfg: CfgNode, model: torch.nn.Module
) -> InferenceBasedLoader:
    """
    Constructs data loader based on inference results of a model.
    """
    dataset = build_bootstrap_dataset(dataset_cfg.DATASET, dataset_cfg.IMAGE_LOADER)
    training_sampler = TrainingSampler(len(dataset))
    data_loader = torch.utils.data.DataLoader(
        dataset,
        batch_size=dataset_cfg.IMAGE_LOADER.BATCH_SIZE,
        sampler=training_sampler,
        num_workers=dataset_cfg.IMAGE_LOADER.NUM_WORKERS,
        collate_fn=trivial_batch_collator,
    )
    return InferenceBasedLoader(
        model,
        data_loader=data_loader,
        data_sampler=build_data_sampler(dataset_cfg.DATA_SAMPLER),
        data_filter=build_data_filter(dataset_cfg.FILTER),
        shuffle=True,
        batch_size=dataset_cfg.INFERENCE.OUTPUT_BATCH_SIZE,
        inference_batch_size=dataset_cfg.INFERENCE.INPUT_BATCH_SIZE,
    )


def has_inference_based_loaders(cfg: CfgNode) -> bool:
    """
    Returns True if at least one inference-based loader must
    be instantiated for training
    """
    return len(cfg.BOOTSTRAP_DATASETS) > 0

def build_inference_based_loaders(
    cfg: CfgNode, model: torch.nn.Module
) -> Tuple[List[InferenceBasedLoader], List[float]]:
    loaders = []
    ratios = []
    for dataset_spec in cfg.BOOTSTRAP_DATASETS:
        dataset_cfg = get_bootstrap_dataset_config().clone()
        dataset_cfg.merge_from_other_cfg(CfgNode(dataset_spec))
        loader = build_inference_based_loader(cfg, dataset_cfg, model)
        loaders.append(loader)
        ratios.append(dataset_cfg.RATIO)
    return loaders, ratios

def build_video_list_dataset(meta: Metadata, cfg: CfgNode):
    video_list_fpath = meta.video_list_fpath
    video_base_path = meta.video_base_path
    if cfg.TYPE == "video_keyframe":
        frame_selector = build_frame_selector(cfg.SELECT)
        transform = build_transform(cfg.TRANSFORM, data_type="image")
        video_list = video_list_from_file(video_list_fpath, video_base_path)
        return VideoKeyframeDataset(video_list, frame_selector, transform)


class _BootstrapDatasetFactoryCatalog(UserDict):
    """
    A global dictionary that stores information about bootstrapped datasets creation functions
    from metadata and config, for diverse DatasetType
    """

    def register(self, dataset_type: DatasetType, factory: Callable[[Metadata, CfgNode], Dataset]):
        """
        Args:
            dataset_type (DatasetType): a DatasetType e.g. DatasetType.VIDEO_LIST
            factory (Callable[Metadata, CfgNode]): a callable which takes Metadata and cfg
                arguments and returns a dataset object.
        """
        assert dataset_type not in self, "Dataset '{}' is already registered!".format(dataset_type)
        self[dataset_type] = factory


BootstrapDatasetFactoryCatalog = _BootstrapDatasetFactoryCatalog()
BootstrapDatasetFactoryCatalog.register(DatasetType.VIDEO_LIST, build_video_list_dataset)
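
# For illustration only: retrieving a registered factory mirrors the lookup in
# build_bootstrap_dataset above (`meta` and `loader_cfg` are assumed given).
#
#   factory = BootstrapDatasetFactoryCatalog.get(DatasetType.VIDEO_LIST)
#   dataset = factory(meta, loader_cfg)  # builds a VideoKeyframeDataset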
@@ -0,0 +1,44 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved

import random
from collections import deque
from typing import Any, Collection, Deque, Iterable, Iterator, List, Sequence

Loader = Iterable[Any]


def _pooled_next(iterator: Iterator[Any], pool: Deque[Any]):
    if not pool:
        pool.extend(next(iterator))
    return pool.popleft()


class CombinedDataLoader:
    """
    Combines data loaders using the provided sampling ratios
    """

    BATCH_COUNT = 100

    def __init__(self, loaders: Collection[Loader], batch_size: int, ratios: Sequence[float]):
        self.loaders = loaders
        self.batch_size = batch_size
        self.ratios = ratios

    def __iter__(self) -> Iterator[List[Any]]:
        iters = [iter(loader) for loader in self.loaders]
        indices = []
        # one independent pool per loader; note that `[deque()] * n` would
        # alias a single shared deque and mix elements across loaders
        pool = [deque() for _ in iters]
        # infinite iterator, as in D2
        while True:
            if not indices:
                # just a buffer of indices, its size doesn't matter
                # as long as it's a multiple of batch_size
                k = self.batch_size * self.BATCH_COUNT
                indices = random.choices(range(len(self.loaders)), self.ratios, k=k)
            try:
                batch = [_pooled_next(iters[i], pool[i]) for i in indices[: self.batch_size]]
            except StopIteration:
                break
            indices = indices[self.batch_size :]
            yield batch
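
if __name__ == "__main__":
    # A minimal self-contained demo (illustration only, not part of the
    # original file): two infinite toy loaders combined with a 3:1 ratio.
    import itertools

    def toy_loader(prefix):
        # yields "batches" (lists) of items, matching what the pool expects
        for i in itertools.count():
            yield [f"{prefix}{i}"]

    combined = CombinedDataLoader([toy_loader("a"), toy_loader("b")], 4, [0.75, 0.25])
    print(next(iter(combined)))  # e.g. ['a0', 'a1', 'b0', 'a2']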
@@ -0,0 +1,168 @@
# -*- coding: utf-8 -*-
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved

import copy
import logging
from typing import Any, Dict, List, Tuple
import torch
from fvcore.common.file_io import PathManager

from detectron2.data import MetadataCatalog
from detectron2.data import detection_utils as utils
from detectron2.data import transforms as T
from detectron2.layers import ROIAlign
from detectron2.structures import BoxMode

from .structures import DensePoseDataRelative, DensePoseList, DensePoseTransformData


def build_augmentation(cfg, is_train):
    logger = logging.getLogger(__name__)
    result = utils.build_augmentation(cfg, is_train)
    if is_train:
        random_rotation = T.RandomRotation(
            cfg.INPUT.ROTATION_ANGLES, expand=False, sample_style="choice"
        )
        result.append(random_rotation)
        logger.info("DensePose-specific augmentation used in training: " + str(random_rotation))
    return result

class DatasetMapper:
    """
    A customized version of `detectron2.data.DatasetMapper`
    """

    def __init__(self, cfg, is_train=True):
        self.augmentation = build_augmentation(cfg, is_train)

        # fmt: off
        self.img_format = cfg.INPUT.FORMAT
        self.mask_on = (
            cfg.MODEL.MASK_ON or (
                cfg.MODEL.DENSEPOSE_ON
                and cfg.MODEL.ROI_DENSEPOSE_HEAD.COARSE_SEGM_TRAINED_BY_MASKS)
        )
        self.keypoint_on = cfg.MODEL.KEYPOINT_ON
        self.densepose_on = cfg.MODEL.DENSEPOSE_ON
        assert not cfg.MODEL.LOAD_PROPOSALS, "not supported yet"
        # fmt: on
        if self.keypoint_on and is_train:
            # Flip only makes sense in training
            self.keypoint_hflip_indices = utils.create_keypoint_hflip_indices(cfg.DATASETS.TRAIN)
        else:
            self.keypoint_hflip_indices = None

        if self.densepose_on:
            densepose_transform_srcs = [
                MetadataCatalog.get(ds).densepose_transform_src
                for ds in cfg.DATASETS.TRAIN + cfg.DATASETS.TEST
            ]
            assert len(densepose_transform_srcs) > 0
            # TODO: check that DensePose transformation data is the same for
            # all the datasets. Otherwise one would have to pass DB ID with
            # each entry to select proper transformation data. For now, since
            # all DensePose annotated data uses the same data semantics, we
            # omit this check.
            densepose_transform_data_fpath = PathManager.get_local_path(densepose_transform_srcs[0])
            self.densepose_transform_data = DensePoseTransformData.load(
                densepose_transform_data_fpath
            )

        self.is_train = is_train

    def __call__(self, dataset_dict):
        """
        Args:
            dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.

        Returns:
            dict: a format that builtin models in detectron2 accept
        """
        dataset_dict = copy.deepcopy(dataset_dict)  # it will be modified by code below
        image = utils.read_image(dataset_dict["file_name"], format=self.img_format)
        utils.check_image_size(dataset_dict, image)

        image, transforms = T.apply_transform_gens(self.augmentation, image)
        image_shape = image.shape[:2]  # h, w
        dataset_dict["image"] = torch.as_tensor(image.transpose(2, 0, 1).astype("float32"))

        if not self.is_train:
            dataset_dict.pop("annotations", None)
            return dataset_dict

        for anno in dataset_dict["annotations"]:
            if not self.mask_on:
                anno.pop("segmentation", None)
            if not self.keypoint_on:
                anno.pop("keypoints", None)

        # USER: Implement additional transformations if you have other types of data
        # USER: Don't call transpose_densepose if you don't need
        annos = [
            self._transform_densepose(
                utils.transform_instance_annotations(
                    obj, transforms, image_shape, keypoint_hflip_indices=self.keypoint_hflip_indices
                ),
                transforms,
            )
            for obj in dataset_dict.pop("annotations")
            if obj.get("iscrowd", 0) == 0
        ]

        if self.mask_on:
            self._add_densepose_masks_as_segmentation(annos, image_shape)

        instances = utils.annotations_to_instances(annos, image_shape, mask_format="bitmask")
        densepose_annotations = [obj.get("densepose") for obj in annos]
        if densepose_annotations and not all(v is None for v in densepose_annotations):
            instances.gt_densepose = DensePoseList(
                densepose_annotations, instances.gt_boxes, image_shape
            )

        dataset_dict["instances"] = instances[instances.gt_boxes.nonempty()]
        return dataset_dict

    def _transform_densepose(self, annotation, transforms):
        if not self.densepose_on:
            return annotation

        # Handle densepose annotations
        is_valid, reason_not_valid = DensePoseDataRelative.validate_annotation(annotation)
        if is_valid:
            densepose_data = DensePoseDataRelative(annotation, cleanup=True)
            densepose_data.apply_transform(transforms, self.densepose_transform_data)
            annotation["densepose"] = densepose_data
        else:
            # logger = logging.getLogger(__name__)
            # logger.debug("Could not load DensePose annotation: {}".format(reason_not_valid))
            DensePoseDataRelative.cleanup_annotation(annotation)
            # NOTE: annotations for certain instances may be unavailable.
            # 'None' is accepted by the DensePoseList data structure.
            annotation["densepose"] = None
        return annotation

    def _add_densepose_masks_as_segmentation(
        self, annotations: List[Dict[str, Any]], image_shape_hw: Tuple[int, int]
    ):
        for obj in annotations:
            if ("densepose" not in obj) or ("segmentation" in obj):
                continue
            # DP segmentation: torch.Tensor [S, S] of float32, S=256
            segm_dp = torch.zeros_like(obj["densepose"].segm)
            segm_dp[obj["densepose"].segm > 0] = 1
            segm_h, segm_w = segm_dp.shape
            bbox_segm_dp = torch.tensor((0, 0, segm_h - 1, segm_w - 1), dtype=torch.float32)
            # image bbox
            x0, y0, x1, y1 = (
                v.item() for v in BoxMode.convert(obj["bbox"], obj["bbox_mode"], BoxMode.XYXY_ABS)
            )
            segm_aligned = (
                ROIAlign((y1 - y0, x1 - x0), 1.0, 0, aligned=True)
                .forward(segm_dp.view(1, 1, *segm_dp.shape), bbox_segm_dp)
                .squeeze()
            )
            image_mask = torch.zeros(*image_shape_hw, dtype=torch.float32)
            image_mask[y0:y1, x0:x1] = segm_aligned
            # segmentation for BitMask: array [H, W] of bool
            obj["segmentation"] = image_mask >= 0.5
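
# For illustration only: applying the mapper to a single dataset record
# (assuming `cfg` carries the DensePose keys and `record` came from a
# DatasetCatalog lookup).
#
#   mapper = DatasetMapper(cfg, is_train=True)
#   model_input = mapper(record)  # dict with an "image" tensor and "instances"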
@@ -0,0 +1,5 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved

from . import builtin  # ensure the builtin datasets are registered

__all__ = [k for k in globals().keys() if "builtin" not in k and not k.startswith("_")]
@@ -0,0 +1,13 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
from .chimpnsee import register_dataset as register_chimpnsee_dataset
from .coco import BASE_DATASETS as BASE_COCO_DATASETS
from .coco import DATASETS as COCO_DATASETS
from .coco import register_datasets as register_coco_datasets

DEFAULT_DATASETS_ROOT = "datasets"


register_coco_datasets(COCO_DATASETS, DEFAULT_DATASETS_ROOT)
register_coco_datasets(BASE_COCO_DATASETS, DEFAULT_DATASETS_ROOT)

register_chimpnsee_dataset(DEFAULT_DATASETS_ROOT)
@@ -0,0 +1,28 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved

import os
from typing import Optional

from detectron2.data import DatasetCatalog, MetadataCatalog

from ..utils import maybe_prepend_base_path
from .dataset_type import DatasetType

CHIMPNSEE_DATASET_NAME = "chimpnsee"


def register_dataset(datasets_root: Optional[os.PathLike] = None):
    def empty_load_callback():
        pass

    video_list_fpath = maybe_prepend_base_path(
        datasets_root, "chimpnsee/cdna.eva.mpg.de/video_list.txt"
    )
    video_base_path = maybe_prepend_base_path(datasets_root, "chimpnsee/cdna.eva.mpg.de")

    DatasetCatalog.register(CHIMPNSEE_DATASET_NAME, empty_load_callback)
    MetadataCatalog.get(CHIMPNSEE_DATASET_NAME).set(
        dataset_type=DatasetType.VIDEO_LIST,
        video_list_fpath=video_list_fpath,
        video_base_path=video_base_path,
    )
@@ -0,0 +1,324 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import contextlib
import io
import logging
import os
from collections import defaultdict
from dataclasses import dataclass
from typing import Any, Dict, Iterable, List, Optional
from fvcore.common.file_io import PathManager
from fvcore.common.timer import Timer

from detectron2.data import DatasetCatalog, MetadataCatalog
from detectron2.structures import BoxMode

from ..utils import maybe_prepend_base_path

DENSEPOSE_MASK_KEY = "dp_masks"
DENSEPOSE_KEYS_WITHOUT_MASK = ["dp_x", "dp_y", "dp_I", "dp_U", "dp_V"]
DENSEPOSE_KEYS = DENSEPOSE_KEYS_WITHOUT_MASK + [DENSEPOSE_MASK_KEY]
DENSEPOSE_METADATA_URL_PREFIX = "https://dl.fbaipublicfiles.com/densepose/data/"

@dataclass
class CocoDatasetInfo:
    name: str
    images_root: str
    annotations_fpath: str


DATASETS = [
    CocoDatasetInfo(
        name="densepose_coco_2014_train",
        images_root="coco/train2014",
        annotations_fpath="coco/annotations/densepose_train2014.json",
    ),
    CocoDatasetInfo(
        name="densepose_coco_2014_minival",
        images_root="coco/val2014",
        annotations_fpath="coco/annotations/densepose_minival2014.json",
    ),
    CocoDatasetInfo(
        name="densepose_coco_2014_minival_100",
        images_root="coco/val2014",
        annotations_fpath="coco/annotations/densepose_minival2014_100.json",
    ),
    CocoDatasetInfo(
        name="densepose_coco_2014_valminusminival",
        images_root="coco/val2014",
        annotations_fpath="coco/annotations/densepose_valminusminival2014.json",
    ),
    CocoDatasetInfo(
        name="densepose_chimps",
        images_root="densepose_evolution/densepose_chimps",
        annotations_fpath="densepose_evolution/annotations/densepose_chimps_densepose.json",
    ),
    CocoDatasetInfo(
        name="posetrack2017_train",
        images_root="posetrack2017/posetrack_data_2017",
        annotations_fpath="posetrack2017/densepose_posetrack_train2017.json",
    ),
    CocoDatasetInfo(
        name="posetrack2017_val",
        images_root="posetrack2017/posetrack_data_2017",
        annotations_fpath="posetrack2017/densepose_posetrack_val2017.json",
    ),
]


BASE_DATASETS = [
    CocoDatasetInfo(
        name="base_coco_2017_train",
        images_root="coco/train2017",
        annotations_fpath="coco/annotations/instances_train2017.json",
    ),
    CocoDatasetInfo(
        name="base_coco_2017_val",
        images_root="coco/val2017",
        annotations_fpath="coco/annotations/instances_val2017.json",
    ),
    CocoDatasetInfo(
        name="base_coco_2017_val_100",
        images_root="coco/val2017",
        annotations_fpath="coco/annotations/instances_val2017_100.json",
    ),
]

def get_metadata(base_path: Optional[os.PathLike]) -> Dict[str, Any]:
    """
    Returns metadata associated with COCO DensePose datasets

    Args:
        base_path: Optional[os.PathLike]
            Base path used to load metadata from

    Returns:
        Dict[str, Any]
            Metadata in the form of a dictionary
    """
    meta = {
        "densepose_transform_src": maybe_prepend_base_path(base_path, "UV_symmetry_transforms.mat"),
        "densepose_smpl_subdiv": maybe_prepend_base_path(base_path, "SMPL_subdiv.mat"),
        "densepose_smpl_subdiv_transform": maybe_prepend_base_path(
            base_path, "SMPL_SUBDIV_TRANSFORM.mat"
        ),
    }
    return meta


def _load_coco_annotations(json_file: str):
    """
    Load COCO annotations from a JSON file

    Args:
        json_file: str
            Path to the file to load annotations from
    Returns:
        Instance of `pycocotools.coco.COCO` that provides access to annotations
        data
    """
    from pycocotools.coco import COCO

    logger = logging.getLogger(__name__)
    timer = Timer()
    with contextlib.redirect_stdout(io.StringIO()):
        coco_api = COCO(json_file)
    if timer.seconds() > 1:
        logger.info("Loading {} takes {:.2f} seconds.".format(json_file, timer.seconds()))
    return coco_api

def _add_categories_metadata(dataset_name: str, categories: Dict[str, Any]):
    meta = MetadataCatalog.get(dataset_name)
    meta.categories = {c["id"]: c["name"] for c in categories}
    logger = logging.getLogger(__name__)
    logger.info("Dataset {} categories: {}".format(dataset_name, categories))


def _verify_annotations_have_unique_ids(json_file: str, anns: List[List[Dict[str, Any]]]):
    if "minival" in json_file:
        # Skip validation on COCO2014 valminusminival and minival annotations
        # The ratio of buggy annotations there is tiny and does not affect accuracy
        # Therefore we explicitly white-list them
        return
    ann_ids = [ann["id"] for anns_per_image in anns for ann in anns_per_image]
    assert len(set(ann_ids)) == len(ann_ids), "Annotation ids in '{}' are not unique!".format(
        json_file
    )


def _maybe_add_bbox(obj: Dict[str, Any], ann_dict: Dict[str, Any]):
    if "bbox" not in ann_dict:
        return
    obj["bbox"] = ann_dict["bbox"]
    obj["bbox_mode"] = BoxMode.XYWH_ABS


def _maybe_add_segm(obj: Dict[str, Any], ann_dict: Dict[str, Any]):
    if "segmentation" not in ann_dict:
        return
    segm = ann_dict["segmentation"]
    if not isinstance(segm, dict):
        # filter out invalid polygons (< 3 points)
        segm = [poly for poly in segm if len(poly) % 2 == 0 and len(poly) >= 6]
        if len(segm) == 0:
            return
    obj["segmentation"] = segm


def _maybe_add_keypoints(obj: Dict[str, Any], ann_dict: Dict[str, Any]):
    if "keypoints" not in ann_dict:
        return
    keypts = ann_dict["keypoints"]  # list[int]
    for idx, v in enumerate(keypts):
        if idx % 3 != 2:
            # COCO's segmentation coordinates are floating points in [0, H or W],
            # but keypoint coordinates are integers in [0, H-1 or W-1]
            # Therefore we assume the coordinates are "pixel indices" and
            # add 0.5 to convert to floating point coordinates.
            keypts[idx] = v + 0.5
    obj["keypoints"] = keypts


def _maybe_add_densepose(obj: Dict[str, Any], ann_dict: Dict[str, Any]):
    for key in DENSEPOSE_KEYS:
        if key in ann_dict:
            obj[key] = ann_dict[key]

def _combine_images_with_annotations(
    dataset_name: str,
    image_root: str,
    img_datas: Iterable[Dict[str, Any]],
    ann_datas: Iterable[Iterable[Dict[str, Any]]],
):

    ann_keys = ["iscrowd", "category_id"]
    dataset_dicts = []
    contains_video_frame_info = False

    for img_dict, ann_dicts in zip(img_datas, ann_datas):
        record = {}
        record["file_name"] = os.path.join(image_root, img_dict["file_name"])
        record["height"] = img_dict["height"]
        record["width"] = img_dict["width"]
        record["image_id"] = img_dict["id"]
        record["dataset"] = dataset_name
        if "frame_id" in img_dict:
            record["frame_id"] = img_dict["frame_id"]
            record["video_id"] = img_dict.get("vid_id", None)
            contains_video_frame_info = True
        objs = []
        for ann_dict in ann_dicts:
            assert ann_dict["image_id"] == record["image_id"]
            assert ann_dict.get("ignore", 0) == 0
            obj = {key: ann_dict[key] for key in ann_keys if key in ann_dict}
            _maybe_add_bbox(obj, ann_dict)
            _maybe_add_segm(obj, ann_dict)
            _maybe_add_keypoints(obj, ann_dict)
            _maybe_add_densepose(obj, ann_dict)
            objs.append(obj)
        record["annotations"] = objs
        dataset_dicts.append(record)
    if contains_video_frame_info:
        create_video_frame_mapping(dataset_name, dataset_dicts)
    return dataset_dicts


def create_video_frame_mapping(dataset_name, dataset_dicts):
    mapping = defaultdict(dict)
    for d in dataset_dicts:
        video_id = d.get("video_id")
        if video_id is None:
            continue
        mapping[video_id].update({d["frame_id"]: d["file_name"]})
    MetadataCatalog.get(dataset_name).set(video_frame_mapping=mapping)

def load_coco_json(annotations_json_file: str, image_root: str, dataset_name: str):
    """
    Loads a JSON file with annotations in COCO instances format.
    Replaces `detectron2.data.datasets.coco.load_coco_json` to handle metadata
    in a more flexible way. Postpones category mapping to a later stage to be
    able to combine several datasets with different (but coherent) sets of
    categories.

    Args:
        annotations_json_file: str
            Path to the JSON file with annotations in COCO instances format.
        image_root: str
            directory that contains all the images
        dataset_name: str
            the name that identifies a dataset, e.g. "densepose_coco_2014_train"
    """
    coco_api = _load_coco_annotations(PathManager.get_local_path(annotations_json_file))
    _add_categories_metadata(dataset_name, coco_api.loadCats(coco_api.getCatIds()))
    # sort indices for reproducible results
    img_ids = sorted(coco_api.imgs.keys())
    # imgs is a list of dicts, each looks something like:
    # {'license': 4,
    #  'url': 'http://farm6.staticflickr.com/5454/9413846304_881d5e5c3b_z.jpg',
    #  'file_name': 'COCO_val2014_000000001268.jpg',
    #  'height': 427,
    #  'width': 640,
    #  'date_captured': '2013-11-17 05:57:24',
    #  'id': 1268}
    imgs = coco_api.loadImgs(img_ids)
    logger = logging.getLogger(__name__)
    logger.info("Loaded {} images in COCO format from {}".format(len(imgs), annotations_json_file))
    # anns is a list[list[dict]], where each dict is an annotation
    # record for an object. The inner list enumerates the objects in an image
    # and the outer list enumerates over images.
    anns = [coco_api.imgToAnns[img_id] for img_id in img_ids]
    _verify_annotations_have_unique_ids(annotations_json_file, anns)
    dataset_records = _combine_images_with_annotations(dataset_name, image_root, imgs, anns)
    return dataset_records

def register_dataset(dataset_data: CocoDatasetInfo, datasets_root: Optional[os.PathLike] = None):
    """
    Registers provided COCO DensePose dataset

    Args:
        dataset_data: CocoDatasetInfo
            Dataset data
        datasets_root: Optional[os.PathLike]
            Datasets root folder (default: None)
    """
    annotations_fpath = maybe_prepend_base_path(datasets_root, dataset_data.annotations_fpath)
    images_root = maybe_prepend_base_path(datasets_root, dataset_data.images_root)

    def load_annotations():
        return load_coco_json(
            annotations_json_file=annotations_fpath,
            image_root=images_root,
            dataset_name=dataset_data.name,
        )

    DatasetCatalog.register(dataset_data.name, load_annotations)
    MetadataCatalog.get(dataset_data.name).set(
        json_file=annotations_fpath,
        image_root=images_root,
        **get_metadata(DENSEPOSE_METADATA_URL_PREFIX)
    )


def register_datasets(
    datasets_data: Iterable[CocoDatasetInfo], datasets_root: Optional[os.PathLike] = None
):
    """
    Registers provided COCO DensePose datasets

    Args:
        datasets_data: Iterable[CocoDatasetInfo]
            An iterable of dataset descriptors
        datasets_root: Optional[os.PathLike]
            Datasets root folder (default: None)
    """
    for dataset_data in datasets_data:
        register_dataset(dataset_data, datasets_root)
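
# For illustration only: registering an additional dataset that follows the
# same layout conventions (the name and paths below are hypothetical).
#
#   register_dataset(
#       CocoDatasetInfo(
#           name="densepose_coco_2014_train_sub",
#           images_root="coco/train2014",
#           annotations_fpath="coco/annotations/densepose_train2014_sub.json",
#       ),
#       datasets_root="datasets",
#   )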
@@ -0,0 +1,11 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved

from enum import Enum


class DatasetType(Enum):
    """
    Dataset type, mostly used for datasets that contain data to bootstrap models on
    """

    VIDEO_LIST = "video_list"
@@ -0,0 +1,53 @@
# -*- coding: utf-8 -*-
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved

import logging
import numpy as np
from typing import Callable, List, Optional
import torch
from torch.utils.data.dataset import Dataset

from detectron2.data.detection_utils import read_image

ImageTransform = Callable[[torch.Tensor], torch.Tensor]


class ImageListDataset(Dataset):
    """
    Dataset that provides images from a list.
    """

    _EMPTY_IMAGE = torch.empty((1, 1, 3))

    def __init__(self, image_list: List[str], transform: Optional[ImageTransform] = None):
        """
        Args:
            image_list (List[str]): list of paths to image files
            transform (ImageTransform): transform applied to each loaded image, if any
        """
        self.image_list = image_list
        self.transform = transform

    def __getitem__(self, idx: int) -> torch.Tensor:
        """
        Gets the image at the given index in the list

        Args:
            idx (int): image index in the image list
        Returns:
            image (torch.Tensor): tensor of size [H, W, 3]
        """
        fpath = self.image_list[idx]

        try:
            image = torch.from_numpy(np.ascontiguousarray(read_image(fpath, format="BGR")))
            if self.transform is not None:
                image = self.transform(image.unsqueeze(0))[0]  # Transforms are done on batches
            return image
        except (OSError, RuntimeError) as e:
            logger = logging.getLogger(__name__)
            logger.warning(f"Error opening image file container {fpath}: {e}")

        return self._EMPTY_IMAGE

    def __len__(self):
        return len(self.image_list)
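
if __name__ == "__main__":
    # A minimal demo (illustration only, not part of the original file):
    # missing files are caught by the error handling above and yield the
    # empty placeholder tensor.
    dataset = ImageListDataset(["/nonexistent/image.jpg"])
    print(len(dataset), dataset[0].shape)  # 1 torch.Size([1, 1, 3])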
@@ -0,0 +1,146 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved

import random
from typing import Any, Callable, Iterable, Iterator, List, Optional, Tuple
import torch
from torch import nn

SampledData = Any
ModelOutput = Any


def _grouper(iterable: Iterable[Any], n: int, fillvalue=None) -> Iterator[Tuple[Any]]:
    """
    Group elements of an iterable by chunks of size `n`, e.g.
    grouper(range(9), 4) ->
        (0, 1, 2, 3), (4, 5, 6, 7), (8, None, None, None)
    """
    it = iter(iterable)
    while True:
        values = []
        for _ in range(n):
            try:
                value = next(it)
            except StopIteration:
                if values:
                    values.extend([fillvalue] * (n - len(values)))
                    yield tuple(values)
                return
            values.append(value)
        yield tuple(values)

class ScoreBasedFilter:
    """
    Filters entries in model output based on their scores
    Discards all entries with score less than the specified minimum
    """

    def __init__(self, min_score: float = 0.8):
        self.min_score = min_score

    def __call__(self, model_output: ModelOutput) -> ModelOutput:
        for model_output_i in model_output:
            instances = model_output_i["instances"]
            if not instances.has("scores"):
                continue
            instances_filtered = instances[instances.scores >= self.min_score]
            model_output_i["instances"] = instances_filtered
        return model_output

class InferenceBasedLoader:
    """
    Data loader based on results inferred by a model. Consists of:
     - a data loader that provides batches of images
     - a model that is used to infer the results
     - a data sampler that converts inferred results to annotations
    """

    def __init__(
        self,
        model: nn.Module,
        data_loader: Iterable[List[torch.Tensor]],
        data_sampler: Optional[Callable[[ModelOutput], List[SampledData]]] = None,
        data_filter: Optional[Callable[[ModelOutput], ModelOutput]] = None,
        shuffle: bool = True,
        batch_size: int = 4,
        inference_batch_size: int = 4,
        drop_last: bool = False,
    ):
        """
        Constructor

        Args:
            model (torch.nn.Module): model used to produce data
            data_loader (Iterable[Tensor]): iterable that provides images
                to perform inference on
            data_sampler (Callable: ModelOutput -> SampledData): functor
                that produces annotation data from inference results;
                (optional, default: None)
            data_filter (Callable: ModelOutput -> ModelOutput): filter
                that selects model outputs for further processing
                (optional, default: None)
            shuffle (bool): if True, the input images get shuffled
            batch_size (int): batch size for the produced annotation data
            inference_batch_size (int): batch size for input images
            drop_last (bool): if True, drop the last batch if it is undersized
        """
        self.model = model
        self.model.eval()
        self.data_loader = data_loader
        self.data_sampler = data_sampler
        self.data_filter = data_filter
        self.shuffle = shuffle
        self.batch_size = batch_size
        self.inference_batch_size = inference_batch_size
        self.drop_last = drop_last

    def __iter__(self) -> Iterator[List[SampledData]]:
        for batch in self.data_loader:
            # batch : List[Tensor[N, C, H, W]]
            # images_batch : Tensor[N, C, H, W]
            # image : Tensor[C, H, W]
            images = [image for images_batch in batch for image in images_batch]
            if not images:
                continue
            if self.shuffle:
                random.shuffle(images)
            yield from self._produce_data(images)

    def _produce_data(self, images: List[torch.Tensor]) -> Iterator[List[SampledData]]:
        """
        Produce batches of data from images

        Args:
            images (List[Tensor]): list of images to process

        Returns:
            Iterator over batches of data sampled from model outputs
        """
        data_batches: List[SampledData] = []
        batched_images = _grouper(images, self.inference_batch_size)
        for batch in batched_images:
            batch = [{"image": img.to(self.model.device)} for img in batch if img is not None]
            if not batch:
                continue
            with torch.no_grad():
                model_output = self.model(batch)
            for model_output_i, batch_i in zip(model_output, batch):
                model_output_i["image"] = batch_i["image"]
            model_output_filtered = (
                model_output if self.data_filter is None else self.data_filter(model_output)
            )
            data = (
                model_output_filtered
                if self.data_sampler is None
                else self.data_sampler(model_output_filtered)
            )
            for data_i in data:
                if len(data_i["instances"]):
                    data_batches.append(data_i)
            if len(data_batches) >= self.batch_size:
                yield data_batches[: self.batch_size]
                data_batches = data_batches[self.batch_size :]
        if not self.drop_last and data_batches:
            yield data_batches
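
# For illustration only: wiring a trained model into an inference-based
# loader (`model` and `image_loader` are assumed to exist; the model must
# expose a `.device` attribute, as detectron2 models do).
#
#   loader = InferenceBasedLoader(
#       model,
#       data_loader=image_loader,                     # yields List[Tensor[N, C, H, W]]
#       data_filter=ScoreBasedFilter(min_score=0.8),  # keep confident detections
#       data_sampler=None,                            # keep raw filtered outputs
#       batch_size=4,
#       inference_batch_size=4,
#   )
#   for sampled_batch in loader:
#       ...  # train on pseudo ground truth derived from predictions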
@@ -0,0 +1,6 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved

from .densepose_uniform import DensePoseUniformSampler
from .densepose_confidence_based import DensePoseConfidenceBasedSampler
from .mask_from_densepose import MaskFromDensePoseSampler, densepose_to_mask
from .prediction_to_gt import PredictionToGroundTruthSampler
@@ -0,0 +1,190 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved

from typing import Any, Dict, List, Optional
import torch
from torch.nn import functional as F

from detectron2.structures import BoxMode, Instances

from ..structures import (
    DensePoseDataRelative,
    DensePoseList,
    DensePoseOutput,
    resample_output_to_bbox,
)


class DensePoseBaseSampler:
    """
    Base DensePose sampler to produce DensePose data from DensePose predictions.
    Samples for each class are drawn according to some distribution over all pixels estimated
    to belong to that class.
    """

    def __init__(self, count_per_class: int = 8):
        """
        Constructor

        Args:
            count_per_class (int): the sampler produces at most `count_per_class`
                samples for each category
        """
        self.count_per_class = count_per_class

    def __call__(self, instances: Instances) -> DensePoseList:
        """
        Convert DensePose predictions (an instance of `DensePoseOutput`)
        into DensePose annotations data (an instance of `DensePoseList`)
        """
        boxes_xyxy_abs = instances.pred_boxes.tensor.clone().cpu()
        boxes_xywh_abs = BoxMode.convert(boxes_xyxy_abs, BoxMode.XYXY_ABS, BoxMode.XYWH_ABS)
        dp_datas = []
        for i, box_xywh in enumerate(boxes_xywh_abs):
            labels_i, result_i = resample_output_to_bbox(
                instances.pred_densepose[i], box_xywh, self._confidence_channels()
            )
            annotation_i = self._sample(labels_i.cpu(), result_i.cpu(), box_xywh)
            annotation_i[DensePoseDataRelative.S_KEY] = self._resample_mask(
                instances.pred_densepose[i]
            )

            dp_datas.append(DensePoseDataRelative(annotation_i))
        # create densepose annotations on CPU
        dp_list = DensePoseList(dp_datas, boxes_xyxy_abs, instances.image_size)
        return dp_list

    def _sample(
        self, labels: torch.Tensor, dp_result: torch.Tensor, bbox_xywh: List[int]
    ) -> Dict[str, Any]:
        """
        Sample DensePoseDataRelative annotation data from estimation results
        """
        annotation = {
            DensePoseDataRelative.X_KEY: [],
            DensePoseDataRelative.Y_KEY: [],
            DensePoseDataRelative.U_KEY: [],
            DensePoseDataRelative.V_KEY: [],
            DensePoseDataRelative.I_KEY: [],
        }
        x0, y0, _, _ = bbox_xywh
        n, h, w = dp_result.shape
        for part_id in range(1, DensePoseDataRelative.N_PART_LABELS + 1):
            # indices - tuple of 3 1D tensors of size k
            # 0: index along the first dimension N
            # 1: index along H dimension
            # 2: index along W dimension
            indices = torch.nonzero(labels.expand(n, h, w) == part_id, as_tuple=True)
            # values - an array of size [n, k]
            # n: number of channels (U, V, confidences)
            # k: number of points labeled with part_id
            values = dp_result[indices].view(n, -1)
            k = values.shape[1]
            count = min(self.count_per_class, k)
            if count <= 0:
                continue
            index_sample = self._produce_index_sample(values, count)
            sampled_values = values[:, index_sample]
            sampled_y = indices[1][index_sample] + 0.5
            sampled_x = indices[2][index_sample] + 0.5
            # prepare / normalize data
            x = (sampled_x / w * 256.0).cpu().tolist()
            y = (sampled_y / h * 256.0).cpu().tolist()
            u = sampled_values[0].clamp(0, 1).cpu().tolist()
            v = sampled_values[1].clamp(0, 1).cpu().tolist()
            fine_segm_labels = [part_id] * count
            # extend annotations
            annotation[DensePoseDataRelative.X_KEY].extend(x)
            annotation[DensePoseDataRelative.Y_KEY].extend(y)
            annotation[DensePoseDataRelative.U_KEY].extend(u)
            annotation[DensePoseDataRelative.V_KEY].extend(v)
            annotation[DensePoseDataRelative.I_KEY].extend(fine_segm_labels)
        return annotation

def _confidence_channels(self) -> Optional[List[str]]:
|
||||
"""
|
||||
Confidence channels to be used for sampling (to be overridden in children)
|
||||
"""
|
||||
return None
|
||||
|
||||
def _produce_index_sample(self, values: torch.Tensor, count: int):
|
||||
"""
|
||||
Abstract method to produce a sample of indices to select data
|
||||
To be implemented in descendants
|
||||
|
||||
Args:
|
||||
values (torch.Tensor): an array of size [n, k] that contains
|
||||
estimated values (U, V, confidences);
|
||||
n: number of channels (U, V, confidences)
|
||||
k: number of points labeled with part_id
|
||||
count (int): number of samples to produce, should be positive and <= k
|
||||
|
||||
Return:
|
||||
list(int): indices of values (along axis 1) selected as a sample
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
def _resample_mask(self, output: DensePoseOutput) -> torch.Tensor:
|
||||
"""
|
||||
Convert output mask tensors into the annotation mask tensor of size
|
||||
(256, 256)
|
||||
"""
|
||||
sz = DensePoseDataRelative.MASK_SIZE
|
||||
S = (
|
||||
F.interpolate(output.S, (sz, sz), mode="bilinear", align_corners=False)
|
||||
.argmax(dim=1)
|
||||
.long()
|
||||
)
|
||||
I = (
|
||||
(
|
||||
F.interpolate(output.I, (sz, sz), mode="bilinear", align_corners=False).argmax(
|
||||
dim=1
|
||||
)
|
||||
* (S > 0).long()
|
||||
)
|
||||
.squeeze()
|
||||
.cpu()
|
||||
)
|
||||
# Map fine segmentation results to coarse segmentation ground truth
|
||||
# TODO: extract this into separate classes
|
||||
# coarse segmentation: 1 = Torso, 2 = Right Hand, 3 = Left Hand,
|
||||
# 4 = Left Foot, 5 = Right Foot, 6 = Upper Leg Right, 7 = Upper Leg Left,
|
||||
# 8 = Lower Leg Right, 9 = Lower Leg Left, 10 = Upper Arm Left,
|
||||
# 11 = Upper Arm Right, 12 = Lower Arm Left, 13 = Lower Arm Right,
|
||||
# 14 = Head
|
||||
# fine segmentation: 1, 2 = Torso, 3 = Right Hand, 4 = Left Hand,
|
||||
# 5 = Left Foot, 6 = Right Foot, 7, 9 = Upper Leg Right,
|
||||
# 8, 10 = Upper Leg Left, 11, 13 = Lower Leg Right,
|
||||
# 12, 14 = Lower Leg Left, 15, 17 = Upper Arm Left,
|
||||
# 16, 18 = Upper Arm Right, 19, 21 = Lower Arm Left,
|
||||
# 20, 22 = Lower Arm Right, 23, 24 = Head
|
||||
FINE_TO_COARSE_SEGMENTATION = {
|
||||
1: 1,
|
||||
2: 1,
|
||||
3: 2,
|
||||
4: 3,
|
||||
5: 4,
|
||||
6: 5,
|
||||
7: 6,
|
||||
8: 7,
|
||||
9: 6,
|
||||
10: 7,
|
||||
11: 8,
|
||||
12: 9,
|
||||
13: 8,
|
||||
14: 9,
|
||||
15: 10,
|
||||
16: 11,
|
||||
17: 10,
|
||||
18: 11,
|
||||
19: 12,
|
||||
20: 13,
|
||||
21: 12,
|
||||
22: 13,
|
||||
23: 14,
|
||||
24: 14,
|
||||
}
|
||||
mask = torch.zeros((sz, sz), dtype=torch.int64, device=torch.device("cpu"))
|
||||
for i in range(DensePoseDataRelative.N_PART_LABELS):
|
||||
mask[I == i + 1] = FINE_TO_COARSE_SEGMENTATION[i + 1]
|
||||
return mask
|
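# Minimal numeric sketch (illustrative, not part of the library): the
# normalization performed in `_sample` above. Pixel centers inside a bbox of
# size (w, h) map to the 256 x 256 frame used by DensePose annotations; the
# numbers below are hypothetical.
w, h = 200.0, 100.0                 # bbox width / height
sampled_x, sampled_y = 50.5, 25.5   # pixel centers (index + 0.5)
x_norm = sampled_x / w * 256.0      # 64.64, stored under dp_x
y_norm = sampled_y / h * 256.0      # 65.28, stored under dp_y
# Inverse mapping back to image coordinates (x0, y0 = bbox top-left corner):
#   x_img = x0 + x_norm * w / 256.0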
|
@ -0,0 +1,91 @@
|
|||
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
|
||||
|
||||
import random
|
||||
from typing import List, Optional
|
||||
import torch
|
||||
|
||||
from .densepose_base import DensePoseBaseSampler
|
||||
|
||||
|
||||
class DensePoseConfidenceBasedSampler(DensePoseBaseSampler):
|
||||
"""
|
||||
Samples DensePose data from DensePose predictions.
|
||||
Samples for each class are drawn using confidence value estimates.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
confidence_channel: str,
|
||||
count_per_class: int = 8,
|
||||
search_count_multiplier: Optional[float] = None,
|
||||
search_proportion: Optional[float] = None,
|
||||
):
|
||||
"""
|
||||
Constructor
|
||||
|
||||
Args:
|
||||
confidence_channel (str): confidence channel to use for sampling;
|
||||
possible values:
|
||||
"sigma_2": confidences for UV values
|
||||
"fine_segm_confidence": confidences for fine segmentation
|
||||
"coarse_segm_confidence": confidences for coarse segmentation
|
||||
count_per_class (int): the sampler produces at most `count_per_class`
|
||||
samples for each category (default: 8)
|
||||
search_count_multiplier (float or None): if not None, the total number
|
||||
of the most confident estimates of a given class to consider is
|
||||
defined as `min(search_count_multiplier * count_per_class, N)`,
|
||||
where `N` is the total number of estimates of the class; cannot be
|
||||
specified together with `search_proportion` (default: None)
|
||||
search_proportion (float or None): if not None, the total number
|
||||
of the most confident estimates of a given class to consider is
|
||||
defined as `min(max(search_proportion * N, count_per_class), N)`,
|
||||
where `N` is the total number of estimates of the class; cannot be
|
||||
specified together with `search_count_multiplier` (default: None)
|
||||
"""
|
||||
super().__init__(count_per_class)
|
||||
self.confidence_channel = confidence_channel
|
||||
self.search_count_multiplier = search_count_multiplier
|
||||
self.search_proportion = search_proportion
|
||||
assert (search_count_multiplier is None) or (search_proportion is None), (
|
||||
f"Cannot specify both search_count_multiplier (={search_count_multiplier})"
|
||||
f"and search_proportion (={search_proportion})"
|
||||
)
|
||||
|
||||
def _confidence_channels(self) -> Optional[List[str]]:
|
||||
"""
|
||||
Confidence channels to be used for sampling
|
||||
"""
|
||||
return [self.confidence_channel]
|
||||
|
||||
def _produce_index_sample(self, values: torch.Tensor, count: int):
|
||||
"""
|
||||
Produce a sample of indices to select data based on confidences
|
||||
|
||||
Args:
|
||||
values (torch.Tensor): an array of size [n, k] that contains
|
||||
estimated values (U, V, confidences);
|
||||
n: number of channels (U, V, confidences)
|
||||
k: number of points labeled with part_id
|
||||
count (int): number of samples to produce, should be positive and <= k
|
||||
|
||||
Return:
|
||||
list(int): indices of values (along axis 1) selected as a sample
|
||||
"""
|
||||
k = values.shape[1]
|
||||
if k == count:
|
||||
index_sample = list(range(k))
|
||||
else:
|
||||
# take the best count * search_count_multiplier pixels,
|
||||
# sample from them uniformly
|
||||
# (here best = smallest variance)
|
||||
_, sorted_confidence_indices = torch.sort(values[2])
|
||||
if self.search_count_multiplier is not None:
|
||||
search_count = min(int(count * self.search_count_multiplier), k)
|
||||
elif self.search_proportion is not None:
|
||||
search_count = min(max(int(k * self.search_proportion), count), k)
|
||||
else:
|
||||
search_count = min(count, k)
|
||||
sample_from_top = random.sample(range(search_count), count)
|
||||
index_sample = sorted_confidence_indices[:search_count][sample_from_top]
|
||||
return index_sample
|
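# Minimal numeric sketch (illustrative, not part of the library): how
# `search_count` bounds the candidate pool in `_produce_index_sample` above;
# all numbers are hypothetical.
k, count = 1000, 8                  # available points / samples to draw
search_count_multiplier = 4.0
search_count = min(int(count * search_count_multiplier), k)   # 32
# 8 indices are then drawn uniformly from the 32 most confident points
# (smallest predicted variance) instead of from all 1000.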
|
@ -0,0 +1,41 @@
|
|||
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
|
||||
|
||||
import random
|
||||
import torch
|
||||
|
||||
from .densepose_base import DensePoseBaseSampler
|
||||
|
||||
|
||||
class DensePoseUniformSampler(DensePoseBaseSampler):
|
||||
"""
|
||||
Samples DensePose data from DensePose predictions.
|
||||
Samples for each class are drawn uniformly over all pixels estimated
|
||||
to belong to that class.
|
||||
"""
|
||||
|
||||
def __init__(self, count_per_class: int = 8):
|
||||
"""
|
||||
Constructor
|
||||
|
||||
Args:
|
||||
count_per_class (int): the sampler produces at most `count_per_class`
|
||||
samples for each category
|
||||
"""
|
||||
super().__init__(count_per_class)
|
||||
|
||||
def _produce_index_sample(self, values: torch.Tensor, count: int):
|
||||
"""
|
||||
Produce a uniform sample of indices to select data
|
||||
|
||||
Args:
|
||||
values (torch.Tensor): an array of size [n, k] that contains
|
||||
estimated values (U, V, confidences);
|
||||
n: number of channels (U, V, confidences)
|
||||
k: number of points labeled with part_id
|
||||
count (int): number of samples to produce, should be positive and <= k
|
||||
|
||||
Return:
|
||||
list(int): indices of values (along axis 1) selected as a sample
|
||||
"""
|
||||
k = values.shape[1]
|
||||
return random.sample(range(k), count)
|
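# Minimal usage sketch (illustrative, not part of the library), assuming
# `instances` is a detectron2 `Instances` with populated `pred_boxes` and
# `pred_densepose` fields, e.g. the output of a DensePose R-CNN model:
#
#     sampler = DensePoseUniformSampler(count_per_class=8)
#     dp_list = sampler(instances)  # DensePoseList of DensePoseDataRelative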
|
@ -0,0 +1,59 @@
|
|||
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
|
||||
|
||||
import torch
|
||||
|
||||
from detectron2.structures import BitMasks, BoxMode, Instances
|
||||
|
||||
from ..structures import resample_output_to_bbox
|
||||
|
||||
|
||||
def densepose_to_mask(instances: Instances) -> BitMasks:
|
||||
"""
|
||||
Produce masks from DensePose predictions
|
||||
DensePose predictions for a given image, stored in `pred_densepose` field,
|
||||
are instances of DensePoseOutput. This function takes
|
||||
`S` and `I` output tensors (coarse and fine segmentation) and converts
|
||||
them to a mask tensor, which is a bool tensor of the size of the input
|
||||
image
|
||||
|
||||
Args:
|
||||
instances (Instances): predicted results, expected to have `pred_densepose` field
|
||||
that contains `DensePoseOutput` objects
|
||||
|
||||
Returns:
|
||||
`BitMasks` instance with boolean tensors of the size of the input image that have non-zero
|
||||
values at pixels that are estimated to belong to the detected objects
|
||||
"""
|
||||
H, W = instances.image_size
|
||||
boxes_xyxy_abs = instances.pred_boxes.tensor.clone().cpu()
|
||||
boxes_xywh_abs = BoxMode.convert(boxes_xyxy_abs, BoxMode.XYXY_ABS, BoxMode.XYWH_ABS)
|
||||
N = len(boxes_xywh_abs)
|
||||
gt_masks = torch.zeros((N, H, W), dtype=torch.bool, device=torch.device("cpu"))
|
||||
for i, box_xywh in enumerate(boxes_xywh_abs):
|
||||
labels_i, _ = resample_output_to_bbox(instances.pred_densepose[i], box_xywh)
|
||||
x, y, w, h = box_xywh.long().tolist()
|
||||
gt_masks[i, y : y + h, x : x + w] = labels_i.cpu() > 0
|
||||
return BitMasks(gt_masks)
|
||||
|
||||
|
||||
class MaskFromDensePoseSampler:
|
||||
"""
|
||||
Produce mask GT from DensePose predictions
|
||||
DensePose prediction is an instance of DensePoseOutput. This sampler takes
|
||||
`S` and `I` output tensors (coarse and fine segmentation) and converts
|
||||
them to a mask tensor, which is a bool tensor of the size of the input
|
||||
image
|
||||
"""
|
||||
|
||||
def __call__(self, instances: Instances) -> BitMasks:
|
||||
"""
|
||||
Converts predicted data from `instances` into the GT mask data
|
||||
|
||||
Args:
|
||||
instances (Instances): predicted results, expected to have `pred_densepose` field
|
||||
|
||||
Returns:
|
||||
`BitMasks` with boolean tensors of the size of the input image that have non-zero
|
||||
values at pixels that are estimated to belong to the detected objects
|
||||
"""
|
||||
return densepose_to_mask(instances)
|
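# Minimal usage sketch (illustrative, not part of the library):
# `densepose_to_mask` resamples each detection's segmentation to its bbox and
# pastes it into a full-image boolean canvas, so DensePose output can be
# consumed as ordinary instance masks:
#
#     masks = densepose_to_mask(instances)   # BitMasks of shape (N, H, W)
#     areas = masks.tensor.sum(dim=(1, 2))   # per-instance pixel counts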
|
@ -0,0 +1,80 @@
|
|||
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Callable, Dict, Optional
|
||||
|
||||
from detectron2.structures import Instances
|
||||
|
||||
ModelOutput = Dict[str, Any]
|
||||
SampledData = Dict[str, Any]
|
||||
|
||||
|
||||
@dataclass
|
||||
class _Sampler:
|
||||
"""
|
||||
Sampler registry entry that contains:
|
||||
- src (str): source field to sample from (deleted after sampling)
|
||||
- dst (Optional[str]): destination field to sample to, if not None
|
||||
- func (Optional[Callable: Any -> Any]): function that performs sampling,
|
||||
if None, reference copy is performed
|
||||
"""
|
||||
|
||||
src: str
|
||||
dst: Optional[str]
|
||||
func: Optional[Callable[[Any], Any]]
|
||||
|
||||
|
||||
class PredictionToGroundTruthSampler:
|
||||
"""
|
||||
Sampler implementation that converts predictions to GT using registered
|
||||
samplers for different fields of `Instances`.
|
||||
"""
|
||||
|
||||
def __init__(self, dataset_name: str = ""):
|
||||
self.dataset_name = dataset_name
|
||||
self._samplers = {}
|
||||
self.register_sampler("pred_boxes", "gt_boxes", None)
|
||||
self.register_sampler("pred_classes", "gt_classes", None)
|
||||
self.register_sampler("scores")
|
||||
|
||||
def __call__(self, model_output: ModelOutput) -> SampledData:
|
||||
"""
|
||||
Transform model output into ground truth data through sampling
|
||||
|
||||
Args:
|
||||
model_output (Dict[str, Any]): model output
|
||||
Returns:
|
||||
Dict[str, Any]: sampled data
|
||||
"""
|
||||
for model_output_i in model_output:
|
||||
instances: Instances = model_output_i["instances"]
|
||||
# transform data in each field
|
||||
for _, sampler in self._samplers.items():
|
||||
if not instances.has(sampler.src) or sampler.dst is None:
|
||||
continue
|
||||
if sampler.func is None:
|
||||
instances.set(sampler.dst, instances.get(sampler.src))
|
||||
else:
|
||||
instances.set(sampler.dst, sampler.func(instances))
|
||||
# delete model output data that was transformed
|
||||
for _, sampler in self._samplers.items():
|
||||
if sampler.src != sampler.dst and instances.has(sampler.src):
|
||||
instances.remove(sampler.src)
|
||||
model_output_i["dataset"] = self.dataset_name
|
||||
return model_output
|
||||
|
||||
def register_sampler(
|
||||
self,
|
||||
prediction_attr: str,
|
||||
gt_attr: Optional[str] = None,
|
||||
func: Optional[Callable[[Any], Any]] = None,
|
||||
):
|
||||
"""
|
||||
Register sampler for a field
|
||||
|
||||
Args:
|
||||
prediction_attr (str): field to replace with a sampled value
|
||||
gt_attr (Optional[str]): field to store the sampled value to, if not None
|
||||
func (Optional[Callable: Any -> Any]): sampler function
|
||||
"""
|
||||
self._samplers[prediction_attr] = _Sampler(src=prediction_attr, dst=gt_attr, func=func)
|
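# Minimal usage sketch (illustrative, not part of the library; the dataset
# name is hypothetical). Note that samplers are keyed by the source field, so
# each prediction attribute supports a single registered sampler:
#
#     sampler = PredictionToGroundTruthSampler("chimps_unlabeled")
#     sampler.register_sampler(
#         "pred_densepose", "gt_densepose", DensePoseUniformSampler()
#     )
#     sampled = sampler(model_output)  # entries now carry gt_* fields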
|
@ -0,0 +1,703 @@
|
|||
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
|
||||
import base64
|
||||
import numpy as np
|
||||
from io import BytesIO
|
||||
from typing import BinaryIO, Dict, List, Optional, Tuple, Union
|
||||
import torch
|
||||
from PIL import Image
|
||||
from torch.nn import functional as F
|
||||
|
||||
|
||||
class DensePoseTransformData(object):
|
||||
|
||||
# Horizontal symmetry label transforms used for horizontal flip
|
||||
MASK_LABEL_SYMMETRIES = [0, 1, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 14]
|
||||
# fmt: off
|
||||
POINT_LABEL_SYMMETRIES = [ 0, 1, 2, 4, 3, 6, 5, 8, 7, 10, 9, 12, 11, 14, 13, 16, 15, 18, 17, 20, 19, 22, 21, 24, 23] # noqa
|
||||
# fmt: on
|
||||
|
||||
def __init__(self, uv_symmetries: Dict[str, torch.Tensor], device: torch.device):
|
||||
self.mask_label_symmetries = DensePoseTransformData.MASK_LABEL_SYMMETRIES
|
||||
self.point_label_symmetries = DensePoseTransformData.POINT_LABEL_SYMMETRIES
|
||||
self.uv_symmetries = uv_symmetries
|
||||
self.device = device
|
||||
|
||||
def to(self, device: torch.device, copy: bool = False) -> "DensePoseTransformData":
|
||||
"""
|
||||
Convert transform data to the specified device
|
||||
|
||||
Args:
|
||||
device (torch.device): device to convert the data to
|
||||
copy (bool): flag that specifies whether to copy or to reference the data
|
||||
in case the device is the same
|
||||
Return:
|
||||
An instance of `DensePoseTransformData` with data stored on the specified device
|
||||
"""
|
||||
if self.device == device and not copy:
|
||||
return self
|
||||
uv_symmetry_map = {}
|
||||
for key in self.uv_symmetries:
|
||||
uv_symmetry_map[key] = self.uv_symmetries[key].to(device=device, copy=copy)
|
||||
return DensePoseTransformData(uv_symmetry_map, device)
|
||||
|
||||
@staticmethod
|
||||
def load(io: Union[str, BinaryIO]):
|
||||
"""
|
||||
Args:
|
||||
io (str or binary file-like object): input file to load data from
|
||||
Returns:
|
||||
An instance of `DensePoseTransformData` with transforms loaded from the file
|
||||
"""
|
||||
import scipy.io
|
||||
|
||||
uv_symmetry_map = scipy.io.loadmat(io)
|
||||
uv_symmetry_map_torch = {}
|
||||
for key in ["U_transforms", "V_transforms"]:
|
||||
uv_symmetry_map_torch[key] = []
|
||||
map_src = uv_symmetry_map[key]
|
||||
map_dst = uv_symmetry_map_torch[key]
|
||||
for i in range(map_src.shape[1]):
|
||||
map_dst.append(torch.from_numpy(map_src[0, i]).to(dtype=torch.float))
|
||||
uv_symmetry_map_torch[key] = torch.stack(map_dst, dim=0)
|
||||
transform_data = DensePoseTransformData(uv_symmetry_map_torch, device=torch.device("cpu"))
|
||||
return transform_data
|
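# Minimal usage sketch (illustrative, not part of the library; the .mat file
# name is hypothetical and stands for the released UV symmetry data):
#
#     transform_data = DensePoseTransformData.load("UV_symmetry_transforms.mat")
#     transform_data = transform_data.to(torch.device("cuda"), copy=True)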
||||
|
||||
|
||||
class DensePoseDataRelative(object):
|
||||
"""
|
||||
Dense pose relative annotations that can be applied to any bounding box:
|
||||
x - normalized X coordinates [0, 255] of annotated points
|
||||
y - normalized Y coordinates [0, 255] of annotated points
|
||||
i - body part labels 0,...,24 for annotated points
|
||||
u - body part U coordinates [0, 1] for annotated points
|
||||
v - body part V coordinates [0, 1] for annotated points
|
||||
segm - 256x256 segmentation mask with values 0,...,14
|
||||
To obtain absolute x and y data wrt some bounding box one needs to first
|
||||
divide the data by 256, multiply by the respective bounding box size
|
||||
and add bounding box offset:
|
||||
x_img = x0 + x_norm * w / 256.0
|
||||
y_img = y0 + y_norm * h / 256.0
|
||||
Segmentation masks are typically sampled to get image-based masks.
|
||||
"""
|
||||
|
||||
# Key for normalized X coordinates in annotation dict
|
||||
X_KEY = "dp_x"
|
||||
# Key for normalized Y coordinates in annotation dict
|
||||
Y_KEY = "dp_y"
|
||||
# Key for U part coordinates in annotation dict
|
||||
U_KEY = "dp_U"
|
||||
# Key for V part coordinates in annotation dict
|
||||
V_KEY = "dp_V"
|
||||
# Key for I point labels in annotation dict
|
||||
I_KEY = "dp_I"
|
||||
# Key for segmentation mask in annotation dict
|
||||
S_KEY = "dp_masks"
|
||||
# Number of body parts in segmentation masks
|
||||
N_BODY_PARTS = 14
|
||||
# Number of parts in point labels
|
||||
N_PART_LABELS = 24
|
||||
MASK_SIZE = 256
|
||||
|
||||
def __init__(self, annotation, cleanup=False):
|
||||
is_valid, reason_not_valid = DensePoseDataRelative.validate_annotation(annotation)
|
||||
assert is_valid, "Invalid DensePose annotations: {}".format(reason_not_valid)
|
||||
self.x = torch.as_tensor(annotation[DensePoseDataRelative.X_KEY])
|
||||
self.y = torch.as_tensor(annotation[DensePoseDataRelative.Y_KEY])
|
||||
self.i = torch.as_tensor(annotation[DensePoseDataRelative.I_KEY])
|
||||
self.u = torch.as_tensor(annotation[DensePoseDataRelative.U_KEY])
|
||||
self.v = torch.as_tensor(annotation[DensePoseDataRelative.V_KEY])
|
||||
self.segm = DensePoseDataRelative.extract_segmentation_mask(annotation)
|
||||
self.device = torch.device("cpu")
|
||||
if cleanup:
|
||||
DensePoseDataRelative.cleanup_annotation(annotation)
|
||||
|
||||
def to(self, device):
|
||||
if self.device == device:
|
||||
return self
|
||||
new_data = DensePoseDataRelative.__new__(DensePoseDataRelative)
|
||||
new_data.x = self.x.to(device)
|
||||
new_data.y = self.y.to(device)
|
||||
new_data.i = self.i.to(device)
|
||||
new_data.u = self.u.to(device)
|
||||
new_data.v = self.v.to(device)
|
||||
new_data.segm = self.segm.to(device)
|
||||
new_data.device = device
|
||||
return new_data
|
||||
|
||||
@staticmethod
|
||||
def extract_segmentation_mask(annotation):
|
||||
poly_specs = annotation[DensePoseDataRelative.S_KEY]
|
||||
if isinstance(poly_specs, torch.Tensor):
|
||||
# data is already given as mask tensors, no need to decode
|
||||
return poly_specs
|
||||
|
||||
import pycocotools.mask as mask_utils
|
||||
|
||||
segm = torch.zeros((DensePoseDataRelative.MASK_SIZE,) * 2, dtype=torch.float32)
|
||||
for i in range(DensePoseDataRelative.N_BODY_PARTS):
|
||||
poly_i = poly_specs[i]
|
||||
if poly_i:
|
||||
mask_i = mask_utils.decode(poly_i)
|
||||
segm[mask_i > 0] = i + 1
|
||||
return segm
|
||||
|
||||
@staticmethod
|
||||
def validate_annotation(annotation):
|
||||
for key in [
|
||||
DensePoseDataRelative.X_KEY,
|
||||
DensePoseDataRelative.Y_KEY,
|
||||
DensePoseDataRelative.I_KEY,
|
||||
DensePoseDataRelative.U_KEY,
|
||||
DensePoseDataRelative.V_KEY,
|
||||
DensePoseDataRelative.S_KEY,
|
||||
]:
|
||||
if key not in annotation:
|
||||
return False, "no {key} data in the annotation".format(key=key)
|
||||
return True, None
|
||||
|
||||
@staticmethod
|
||||
def cleanup_annotation(annotation):
|
||||
for key in [
|
||||
DensePoseDataRelative.X_KEY,
|
||||
DensePoseDataRelative.Y_KEY,
|
||||
DensePoseDataRelative.I_KEY,
|
||||
DensePoseDataRelative.U_KEY,
|
||||
DensePoseDataRelative.V_KEY,
|
||||
DensePoseDataRelative.S_KEY,
|
||||
]:
|
||||
if key in annotation:
|
||||
del annotation[key]
|
||||
|
||||
def apply_transform(self, transforms, densepose_transform_data):
|
||||
self._transform_pts(transforms, densepose_transform_data)
|
||||
self._transform_segm(transforms, densepose_transform_data)
|
||||
|
||||
def _transform_pts(self, transforms, dp_transform_data):
|
||||
import detectron2.data.transforms as T
|
||||
|
||||
# NOTE: This assumes that HorizFlipTransform is the only one that does flip
|
||||
do_hflip = sum(isinstance(t, T.HFlipTransform) for t in transforms.transforms) % 2 == 1
|
||||
if do_hflip:
|
||||
self.x = self.segm.size(1) - self.x
|
||||
self._flip_iuv_semantics(dp_transform_data)
|
||||
|
||||
for t in transforms.transforms:
|
||||
if isinstance(t, T.RotationTransform):
|
||||
xy_scale = np.array((t.w, t.h)) / DensePoseDataRelative.MASK_SIZE
|
||||
xy = t.apply_coords(np.stack((self.x, self.y), axis=1) * xy_scale)
|
||||
self.x, self.y = torch.tensor(xy / xy_scale, dtype=self.x.dtype).T
|
||||
|
||||
def _flip_iuv_semantics(self, dp_transform_data: DensePoseTransformData) -> None:
|
||||
i_old = self.i.clone()
|
||||
uv_symmetries = dp_transform_data.uv_symmetries
|
||||
pt_label_symmetries = dp_transform_data.point_label_symmetries
|
||||
for i in range(self.N_PART_LABELS):
|
||||
if i + 1 in i_old:
|
||||
annot_indices_i = i_old == i + 1
|
||||
if pt_label_symmetries[i + 1] != i + 1:
|
||||
self.i[annot_indices_i] = pt_label_symmetries[i + 1]
|
||||
u_loc = (self.u[annot_indices_i] * 255).long()
|
||||
v_loc = (self.v[annot_indices_i] * 255).long()
|
||||
self.u[annot_indices_i] = uv_symmetries["U_transforms"][i][v_loc, u_loc].to(
|
||||
device=self.u.device
|
||||
)
|
||||
self.v[annot_indices_i] = uv_symmetries["V_transforms"][i][v_loc, u_loc].to(
|
||||
device=self.v.device
|
||||
)
|
||||
|
||||
def _transform_segm(self, transforms, dp_transform_data):
|
||||
import detectron2.data.transforms as T
|
||||
|
||||
# NOTE: This assumes that HorizFlipTransform is the only one that does flip
|
||||
do_hflip = sum(isinstance(t, T.HFlipTransform) for t in transforms.transforms) % 2 == 1
|
||||
if do_hflip:
|
||||
self.segm = torch.flip(self.segm, [1])
|
||||
self._flip_segm_semantics(dp_transform_data)
|
||||
|
||||
for t in transforms.transforms:
|
||||
if isinstance(t, T.RotationTransform):
|
||||
self._transform_segm_rotation(t)
|
||||
|
||||
def _flip_segm_semantics(self, dp_transform_data):
|
||||
old_segm = self.segm.clone()
|
||||
mask_label_symmetries = dp_transform_data.mask_label_symmetries
|
||||
for i in range(self.N_BODY_PARTS):
|
||||
if mask_label_symmetries[i + 1] != i + 1:
|
||||
self.segm[old_segm == i + 1] = mask_label_symmetries[i + 1]
|
||||
|
||||
def _transform_segm_rotation(self, rotation):
|
||||
self.segm = F.interpolate(self.segm[None, None, :], (rotation.h, rotation.w)).numpy()
|
||||
self.segm = torch.tensor(rotation.apply_segmentation(self.segm[0, 0]))[None, None, :]
|
||||
self.segm = F.interpolate(self.segm, [DensePoseDataRelative.MASK_SIZE] * 2)[0, 0]
|
||||
|
||||
|
||||
def normalized_coords_transform(x0, y0, w, h):
|
||||
"""
|
||||
Coordinates transform that maps top left corner to (-1, -1) and bottom
|
||||
right corner to (1, 1). Used for torch.grid_sample to initialize the
|
||||
grid
|
||||
"""
|
||||
|
||||
def f(p):
|
||||
return (2 * (p[0] - x0) / w - 1, 2 * (p[1] - y0) / h - 1)
|
||||
|
||||
return f
|
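# Minimal numeric sketch (illustrative, not part of the library): corner
# behavior of `normalized_coords_transform` for a hypothetical bbox.
f = normalized_coords_transform(x0=10, y0=20, w=100, h=50)
f((10, 20))    # top-left corner     -> (-1.0, -1.0)
f((110, 70))   # bottom-right corner -> ( 1.0,  1.0)
f((60, 45))    # bbox center         -> ( 0.0,  0.0)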
||||
|
||||
|
||||
class DensePoseOutput(object):
|
||||
def __init__(self, S, I, U, V, confidences):
|
||||
"""
|
||||
Args:
|
||||
S (`torch.Tensor`): coarse segmentation tensor of size (N, A, H, W)
|
||||
I (`torch.Tensor`): fine segmentation tensor of size (N, C, H, W)
|
||||
U (`torch.Tensor`): U coordinates for each fine segmentation label of size (N, C, H, W)
|
||||
V (`torch.Tensor`): V coordinates for each fine segmentation label of size (N, C, H, W)
|
||||
confidences (dict of str -> `torch.Tensor`): estimated confidence model parameters
|
||||
"""
|
||||
self.S = S
|
||||
self.I = I # noqa: E741
|
||||
self.U = U
|
||||
self.V = V
|
||||
self.confidences = confidences
|
||||
self._check_output_dims(S, I, U, V)
|
||||
|
||||
def _check_output_dims(self, S, I, U, V):
|
||||
assert (
|
||||
len(S.size()) == 4
|
||||
), "Segmentation output should have 4 " "dimensions (NCHW), but has size {}".format(
|
||||
S.size()
|
||||
)
|
||||
assert (
|
||||
len(I.size()) == 4
|
||||
), "Segmentation output should have 4 " "dimensions (NCHW), but has size {}".format(
|
||||
I.size()
|
||||
)
|
||||
assert (
|
||||
len(U.size()) == 4
|
||||
), "Segmentation output should have 4 " "dimensions (NCHW), but has size {}".format(
|
||||
U.size()
|
||||
)
|
||||
assert (
|
||||
len(V.size()) == 4
|
||||
), "Segmentation output should have 4 " "dimensions (NCHW), but has size {}".format(
|
||||
V.size()
|
||||
)
|
||||
assert len(S) == len(I), (
|
||||
"Number of output segmentation planes {} "
|
||||
"should be equal to the number of output part index "
|
||||
"planes {}".format(len(S), len(I))
|
||||
)
|
||||
assert S.size()[2:] == I.size()[2:], (
|
||||
"Output segmentation plane size {} "
|
||||
"should be equal to the output part index "
|
||||
"plane size {}".format(S.size()[2:], I.size()[2:])
|
||||
)
|
||||
assert I.size() == U.size(), (
|
||||
"Part index output shape {} "
|
||||
"should be the same as U coordinates output shape {}".format(I.size(), U.size())
|
||||
)
|
||||
assert I.size() == V.size(), (
|
||||
"Part index output shape {} "
|
||||
"should be the same as V coordinates output shape {}".format(I.size(), V.size())
|
||||
)
|
||||
|
||||
def resize(self, image_size_hw):
|
||||
# do nothing - outputs are invariant to resize
|
||||
pass
|
||||
|
||||
def _crop(self, S, I, U, V, bbox_old_xywh, bbox_new_xywh):
|
||||
"""
|
||||
Resample S, I, U, V from bbox_old to the cropped bbox_new
|
||||
"""
|
||||
x0old, y0old, wold, hold = bbox_old_xywh
|
||||
x0new, y0new, wnew, hnew = bbox_new_xywh
|
||||
tr_coords = normalized_coords_transform(x0old, y0old, wold, hold)
|
||||
topleft = (x0new, y0new)
|
||||
bottomright = (x0new + wnew, y0new + hnew)
|
||||
topleft_norm = tr_coords(topleft)
|
||||
bottomright_norm = tr_coords(bottomright)
|
||||
hsize = S.size(1)
|
||||
wsize = S.size(2)
|
||||
grid = torch.meshgrid(
|
||||
torch.arange(
|
||||
topleft_norm[1],
|
||||
bottomright_norm[1],
|
||||
(bottomright_norm[1] - topleft_norm[1]) / hsize,
|
||||
)[:hsize],
|
||||
torch.arange(
|
||||
topleft_norm[0],
|
||||
bottomright_norm[0],
|
||||
(bottomright_norm[0] - topleft_norm[0]) / wsize,
|
||||
)[:wsize],
|
||||
)
|
||||
grid = torch.stack(grid, dim=2).to(S.device)
|
||||
assert (
|
||||
grid.size(0) == hsize
|
||||
), "Resampled grid expected " "height={}, actual height={}".format(hsize, grid.size(0))
|
||||
assert grid.size(1) == wsize, "Resampled grid expected " "width={}, actual width={}".format(
|
||||
wsize, grid.size(1)
|
||||
)
|
||||
S_new = F.grid_sample(
|
||||
S.unsqueeze(0),
|
||||
torch.unsqueeze(grid, 0),
|
||||
mode="bilinear",
|
||||
padding_mode="border",
|
||||
align_corners=True,
|
||||
).squeeze(0)
|
||||
I_new = F.grid_sample(
|
||||
I.unsqueeze(0),
|
||||
torch.unsqueeze(grid, 0),
|
||||
mode="bilinear",
|
||||
padding_mode="border",
|
||||
align_corners=True,
|
||||
).squeeze(0)
|
||||
U_new = F.grid_sample(
|
||||
U.unsqueeze(0),
|
||||
torch.unsqueeze(grid, 0),
|
||||
mode="bilinear",
|
||||
padding_mode="border",
|
||||
align_corners=True,
|
||||
).squeeze(0)
|
||||
V_new = F.grid_sample(
|
||||
V.unsqueeze(0),
|
||||
torch.unsqueeze(grid, 0),
|
||||
mode="bilinear",
|
||||
padding_mode="border",
|
||||
align_corners=True,
|
||||
).squeeze(0)
|
||||
return S_new, I_new, U_new, V_new
|
||||
|
||||
def crop(self, indices_cropped, bboxes_old, bboxes_new):
|
||||
"""
|
||||
Crop outputs for selected bounding boxes to the new bounding boxes.
|
||||
"""
|
||||
# VK: cropping is ignored for now
|
||||
# for i, ic in enumerate(indices_cropped):
|
||||
# self.S[ic], self.I[ic], self.U[ic], self.V[ic] = \
|
||||
# self._crop(self.S[ic], self.I[ic], self.U[ic], self.V[ic],
|
||||
# bboxes_old[i], bboxes_new[i])
|
||||
pass
|
||||
|
||||
def hflip(self, transform_data: DensePoseTransformData) -> None:
|
||||
"""
|
||||
Change S, I, U and V to take into account a Horizontal flip.
|
||||
"""
|
||||
if self.I.shape[0] > 0:
|
||||
for el in "SIUV":
|
||||
self.__dict__[el] = torch.flip(self.__dict__[el], [3])
|
||||
for key in self.confidences:
|
||||
self.confidences[key] = torch.flip(self.confidences[key], [3])
|
||||
self._flip_iuv_semantics_tensor(transform_data)
|
||||
self._flip_segm_semantics_tensor(transform_data)
|
||||
|
||||
def _flip_iuv_semantics_tensor(self, dp_transform_data: DensePoseTransformData) -> None:
|
||||
point_label_symmetries = dp_transform_data.point_label_symmetries
|
||||
uv_symmetries = dp_transform_data.uv_symmetries
|
||||
|
||||
N, C, H, W = self.U.shape
|
||||
u_loc = (self.U[:, 1:, :, :].clamp(0, 1) * 255).long()
|
||||
v_loc = (self.V[:, 1:, :, :].clamp(0, 1) * 255).long()
|
||||
Iindex = torch.arange(C - 1, device=self.U.device)[None, :, None, None].expand(
|
||||
N, C - 1, H, W
|
||||
)
|
||||
self.U[:, 1:, :, :] = uv_symmetries["U_transforms"][Iindex, v_loc, u_loc]
|
||||
self.V[:, 1:, :, :] = uv_symmetries["V_transforms"][Iindex, v_loc, u_loc]
|
||||
|
||||
for el in "IUV":
|
||||
self.__dict__[el] = self.__dict__[el][:, point_label_symmetries, :, :]
|
||||
|
||||
def _flip_segm_semantics_tensor(self, dp_transform_data):
|
||||
if self.S.shape[1] == DensePoseDataRelative.N_BODY_PARTS + 1:
|
||||
self.S = self.S[:, dp_transform_data.mask_label_symmetries, :, :]
|
||||
|
||||
def to_result(self, boxes_xywh):
|
||||
"""
|
||||
Convert DensePose outputs to results format. Results are more compact,
|
||||
but cannot be resampled any more
|
||||
"""
|
||||
result = DensePoseResult(boxes_xywh, self.S, self.I, self.U, self.V)
|
||||
return result
|
||||
|
||||
def __getitem__(self, item):
|
||||
if isinstance(item, int):
|
||||
S_selected = self.S[item].unsqueeze(0)
|
||||
I_selected = self.I[item].unsqueeze(0)
|
||||
U_selected = self.U[item].unsqueeze(0)
|
||||
V_selected = self.V[item].unsqueeze(0)
|
||||
conf_selected = {}
|
||||
for key in self.confidences:
|
||||
conf_selected[key] = self.confidences[key][item].unsqueeze(0)
|
||||
else:
|
||||
S_selected = self.S[item]
|
||||
I_selected = self.I[item]
|
||||
U_selected = self.U[item]
|
||||
V_selected = self.V[item]
|
||||
conf_selected = {}
|
||||
for key in self.confidences:
|
||||
conf_selected[key] = self.confidences[key][item]
|
||||
return DensePoseOutput(S_selected, I_selected, U_selected, V_selected, conf_selected)
|
||||
|
||||
def __str__(self):
|
||||
s = "DensePoseOutput S {}, I {}, U {}, V {}".format(
|
||||
list(self.S.size()), list(self.I.size()), list(self.U.size()), list(self.V.size())
|
||||
)
|
||||
s_conf = "confidences: [{}]".format(
|
||||
", ".join([f"{key} {list(self.confidences[key].size())}" for key in self.confidences])
|
||||
)
|
||||
return ", ".join([s, s_conf])
|
||||
|
||||
def __len__(self):
|
||||
return self.S.size(0)
|
||||
|
||||
|
||||
def resample_output_to_bbox(
|
||||
output: DensePoseOutput, bbox_xywh_abs: List[int], confidences: Optional[List[str]] = None
|
||||
) -> Tuple[torch.Tensor, torch.Tensor]:
|
||||
"""
|
||||
Convert DensePose output of size [1, C, S, S] into DensePose results [D, H_i, W_i],
|
||||
where `i` is detection index and `D == 2 + len(confidences)`. This conversion:
|
||||
- resamples data to the detection bounding box size (H_i, W_i),
|
||||
- sets label for each pixel of the bounding box as the `argmax` of scores,
|
||||
- assigns values (U, V, confidences) based on label and resampled data
|
||||
|
||||
Args:
|
||||
output (DensePoseOutput): outputs of the DensePose model
|
||||
bbox_xywh_abs (List[int]): bounding box, a list of 4 integer values XYWH
|
||||
confidences (List[str]): optional list of `str` that specifies confidence
|
||||
channels to be resampled and added to the results
|
||||
|
||||
Returns:
|
||||
labels (torch.Tensor): tensor [1, H_i, W_i] of `torch.uint8` containing fine
|
||||
segmentation labels of each pixel
|
||||
data (torch.Tensor): tensor [D, H_i, W_i] of `torch.float32` containing
|
||||
for each pixel the estimated U, V coordinates and the requested
|
||||
confidence values in the order that corresponds to `confidences`
|
||||
"""
|
||||
x, y, w, h = bbox_xywh_abs
|
||||
w = max(int(w), 1)
|
||||
h = max(int(h), 1)
|
||||
N_out = 2 if confidences is None else 2 + len(confidences)
|
||||
device = output.U.device
|
||||
data = torch.zeros([N_out, h, w], dtype=torch.float32, device=device)
|
||||
# coarse segmentation
|
||||
assert (
|
||||
len(output.S.size()) == 4
|
||||
), "AnnIndex tensor size should have {} dimensions but has {}".format(4, len(output.S.size()))
|
||||
s_bbox = F.interpolate(output.S, (h, w), mode="bilinear", align_corners=False).argmax(dim=1)
|
||||
# fine segmentation
|
||||
assert (
|
||||
len(output.I.size()) == 4
|
||||
), "IndexUV tensor size should have {} dimensions but has {}".format(4, len(output.S.size()))
|
||||
labels = (
|
||||
F.interpolate(output.I, (h, w), mode="bilinear", align_corners=False).argmax(dim=1)
|
||||
* (s_bbox > 0).long()
|
||||
).squeeze(0)
|
||||
# U
|
||||
assert len(output.U.size()) == 4, "U tensor size should have {} dimensions but has {}".format(
|
||||
4, len(output.U.size())
|
||||
)
|
||||
u_bbox = F.interpolate(output.U, (h, w), mode="bilinear", align_corners=False)
|
||||
# V
|
||||
assert len(output.V.size()) == 4, "V tensor size should have {} dimensions but has {}".format(
|
||||
4, len(output.V.size())
|
||||
)
|
||||
v_bbox = F.interpolate(output.V, (h, w), mode="bilinear", align_corners=False)
|
||||
# confidences
|
||||
if confidences is not None:
|
||||
resampled_confidence = {}
|
||||
for key in output.confidences:
|
||||
resampled_confidence[key] = F.interpolate(
|
||||
output.confidences[key], (h, w), mode="bilinear", align_corners=False
|
||||
)
|
||||
|
||||
# assign data from channels that correspond to the labels
|
||||
for part_id in range(1, u_bbox.size(1)):
|
||||
data[0][labels == part_id] = u_bbox[0, part_id][labels == part_id]
|
||||
data[1][labels == part_id] = v_bbox[0, part_id][labels == part_id]
|
||||
if confidences is None:
|
||||
continue
|
||||
for i, key in enumerate(confidences):
|
||||
if resampled_confidence[key].size(1) != u_bbox.size(1):
|
||||
# confidence is not part-based, don't try to fill it part by part
|
||||
continue
|
||||
data[2 + i][labels == part_id] = resampled_confidence[key][0, part_id][
|
||||
labels == part_id
|
||||
]
|
||||
if confidences is not None:
|
||||
for i, key in enumerate(confidences):
|
||||
if resampled_confidence[key].size(1) != u_bbox.size(1):
|
||||
# confidence is not part-based, fill the data with the first channel
|
||||
# (targeted for segmentation confidences that have only 1 channel)
|
||||
data[2 + i] = resampled_confidence[key][0, 0]
|
||||
return labels.unsqueeze(0), data
|
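# Minimal usage sketch (illustrative, not part of the library), assuming
# `output` holds a single detection and x, y, w, h are its bbox:
#
#     labels, data = resample_output_to_bbox(output, [x, y, w, h], ["sigma_2"])
#     # labels: (1, h, w) fine segmentation labels;
#     # data:   (3, h, w) with U, V and the resampled sigma_2 confidence,
#     # valid wherever labels > 0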
||||
|
||||
|
||||
class DensePoseResult(object):
|
||||
def __init__(self, boxes_xywh, S, I, U, V):
|
||||
self.results = []
|
||||
self.boxes_xywh = boxes_xywh.cpu().tolist()
|
||||
assert len(boxes_xywh.size()) == 2
|
||||
assert boxes_xywh.size(1) == 4
|
||||
for i, box_xywh in enumerate(boxes_xywh):
|
||||
result_i = self._output_to_result(box_xywh, S[[i]], I[[i]], U[[i]], V[[i]])
|
||||
result_numpy_i = result_i.cpu().numpy()
|
||||
result_encoded_i = DensePoseResult.encode_png_data(result_numpy_i)
|
||||
result_encoded_with_shape_i = (result_numpy_i.shape, result_encoded_i)
|
||||
self.results.append(result_encoded_with_shape_i)
|
||||
|
||||
def __str__(self):
|
||||
s = "DensePoseResult: N={} [{}]".format(
|
||||
len(self.results), ", ".join([str(list(r[0])) for r in self.results])
|
||||
)
|
||||
return s
|
||||
|
||||
def _output_to_result(self, box_xywh, S, I, U, V):
|
||||
# TODO: reuse resample_output_to_bbox
|
||||
x, y, w, h = box_xywh
|
||||
w = max(int(w), 1)
|
||||
h = max(int(h), 1)
|
||||
result = torch.zeros([3, h, w], dtype=torch.uint8, device=U.device)
|
||||
assert (
|
||||
len(S.size()) == 4
|
||||
), "AnnIndex tensor size should have {} " "dimensions but has {}".format(4, len(S.size()))
|
||||
s_bbox = F.interpolate(S, (h, w), mode="bilinear", align_corners=False).argmax(dim=1)
|
||||
assert (
|
||||
len(I.size()) == 4
|
||||
), "IndexUV tensor size should have {} " "dimensions but has {}".format(4, len(S.size()))
|
||||
i_bbox = (
|
||||
F.interpolate(I, (h, w), mode="bilinear", align_corners=False).argmax(dim=1)
|
||||
* (s_bbox > 0).long()
|
||||
).squeeze(0)
|
||||
assert len(U.size()) == 4, "U tensor size should have {} " "dimensions but has {}".format(
|
||||
4, len(U.size())
|
||||
)
|
||||
u_bbox = F.interpolate(U, (h, w), mode="bilinear", align_corners=False)
|
||||
assert len(V.size()) == 4, "V tensor size should have {} " "dimensions but has {}".format(
|
||||
4, len(V.size())
|
||||
)
|
||||
v_bbox = F.interpolate(V, (h, w), mode="bilinear", align_corners=False)
|
||||
result[0] = i_bbox
|
||||
for part_id in range(1, u_bbox.size(1)):
|
||||
result[1][i_bbox == part_id] = (
|
||||
(u_bbox[0, part_id][i_bbox == part_id] * 255).clamp(0, 255).to(torch.uint8)
|
||||
)
|
||||
result[2][i_bbox == part_id] = (
|
||||
(v_bbox[0, part_id][i_bbox == part_id] * 255).clamp(0, 255).to(torch.uint8)
|
||||
)
|
||||
assert (
|
||||
result.size(1) == h
|
||||
), "Results height {} should be equal" "to bounding box height {}".format(result.size(1), h)
|
||||
assert (
|
||||
result.size(2) == w
|
||||
), "Results width {} should be equal" "to bounding box width {}".format(result.size(2), w)
|
||||
return result
|
||||
|
||||
@staticmethod
|
||||
def encode_png_data(arr):
|
||||
"""
|
||||
Encode array data as a PNG image using the highest compression rate
|
||||
@param arr [in] Data stored in an array of size (3, M, N) of type uint8
|
||||
@return Base64-encoded string containing PNG-compressed data
|
||||
"""
|
||||
assert len(arr.shape) == 3, "Expected a 3D array as an input," " got a {0}D array".format(
|
||||
len(arr.shape)
|
||||
)
|
||||
assert arr.shape[0] == 3, "Expected first array dimension of size 3," " got {0}".format(
|
||||
arr.shape[0]
|
||||
)
|
||||
assert arr.dtype == np.uint8, "Expected an array of type np.uint8, " "got {0}".format(
|
||||
arr.dtype
|
||||
)
|
||||
data = np.moveaxis(arr, 0, -1)
|
||||
im = Image.fromarray(data)
|
||||
fstream = BytesIO()
|
||||
im.save(fstream, format="png", optimize=True)
|
||||
s = base64.encodebytes(fstream.getvalue()).decode()
|
||||
return s
|
||||
|
||||
@staticmethod
|
||||
def decode_png_data(shape, s):
|
||||
"""
|
||||
Decode array data from a string that contains PNG-compressed data
|
||||
@param shape [in] Shape (3, M, N) of the stored data
@param s [in] Base64-encoded string containing PNG-compressed data
|
||||
@return Data stored in an array of size (3, M, N) of type uint8
|
||||
"""
|
||||
fstream = BytesIO(base64.decodebytes(s.encode()))
|
||||
im = Image.open(fstream)
|
||||
data = np.moveaxis(np.array(im.getdata(), dtype=np.uint8), -1, 0)
|
||||
return data.reshape(shape)
|
||||
|
||||
def __len__(self):
|
||||
return len(self.results)
|
||||
|
||||
def __getitem__(self, item):
|
||||
result_encoded = self.results[item]
|
||||
bbox_xywh = self.boxes_xywh[item]
|
||||
return result_encoded, bbox_xywh
|
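# Minimal round-trip sketch (illustrative, not part of the library): the
# encode/decode pair above is lossless for uint8 IUV data.
#
#     import numpy as np
#     arr = np.random.randint(0, 256, size=(3, 4, 5), dtype=np.uint8)
#     s = DensePoseResult.encode_png_data(arr)
#     assert np.array_equal(DensePoseResult.decode_png_data(arr.shape, s), arr)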
||||
|
||||
|
||||
class DensePoseList(object):
|
||||
|
||||
_TORCH_DEVICE_CPU = torch.device("cpu")
|
||||
|
||||
def __init__(self, densepose_datas, boxes_xyxy_abs, image_size_hw, device=_TORCH_DEVICE_CPU):
|
||||
assert len(densepose_datas) == len(
|
||||
boxes_xyxy_abs
|
||||
), "Attempt to initialize DensePoseList with {} DensePose datas " "and {} boxes".format(
|
||||
len(densepose_datas), len(boxes_xyxy_abs)
|
||||
)
|
||||
self.densepose_datas = []
|
||||
for densepose_data in densepose_datas:
|
||||
assert isinstance(densepose_data, DensePoseDataRelative) or densepose_data is None, (
|
||||
"Attempt to initialize DensePoseList with DensePose datas "
|
||||
"of type {}, expected DensePoseDataRelative".format(type(densepose_data))
|
||||
)
|
||||
densepose_data_ondevice = (
|
||||
densepose_data.to(device) if densepose_data is not None else None
|
||||
)
|
||||
self.densepose_datas.append(densepose_data_ondevice)
|
||||
self.boxes_xyxy_abs = boxes_xyxy_abs.to(device)
|
||||
self.image_size_hw = image_size_hw
|
||||
self.device = device
|
||||
|
||||
def to(self, device):
|
||||
if self.device == device:
|
||||
return self
|
||||
return DensePoseList(self.densepose_datas, self.boxes_xyxy_abs, self.image_size_hw, device)
|
||||
|
||||
def __iter__(self):
|
||||
return iter(self.densepose_datas)
|
||||
|
||||
def __len__(self):
|
||||
return len(self.densepose_datas)
|
||||
|
||||
def __repr__(self):
|
||||
s = self.__class__.__name__ + "("
|
||||
s += "num_instances={}, ".format(len(self.densepose_datas))
|
||||
s += "image_width={}, ".format(self.image_size_hw[1])
|
||||
s += "image_height={})".format(self.image_size_hw[0])
|
||||
return s
|
||||
|
||||
def __getitem__(self, item):
|
||||
if isinstance(item, int):
|
||||
densepose_data_rel = self.densepose_datas[item]
|
||||
return densepose_data_rel
|
||||
elif isinstance(item, slice):
|
||||
densepose_datas_rel = self.densepose_datas[item]
|
||||
boxes_xyxy_abs = self.boxes_xyxy_abs[item]
|
||||
return DensePoseList(
|
||||
densepose_datas_rel, boxes_xyxy_abs, self.image_size_hw, self.device
|
||||
)
|
||||
elif isinstance(item, torch.Tensor) and (item.dtype == torch.bool):
|
||||
densepose_datas_rel = [self.densepose_datas[i] for i, x in enumerate(item) if x > 0]
|
||||
boxes_xyxy_abs = self.boxes_xyxy_abs[item]
|
||||
return DensePoseList(
|
||||
densepose_datas_rel, boxes_xyxy_abs, self.image_size_hw, self.device
|
||||
)
|
||||
else:
|
||||
densepose_datas_rel = [self.densepose_datas[i] for i in item]
|
||||
boxes_xyxy_abs = self.boxes_xyxy_abs[item]
|
||||
return DensePoseList(
|
||||
densepose_datas_rel, boxes_xyxy_abs, self.image_size_hw, self.device
|
||||
)
|
|
@ -0,0 +1,3 @@
|
|||
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
|
||||
|
||||
from .image import ImageResizeTransform
|
|
@ -0,0 +1,37 @@
|
|||
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
|
||||
|
||||
import torch
|
||||
|
||||
|
||||
class ImageResizeTransform:
|
||||
"""
|
||||
Transform that converts frames loaded from a dataset
|
||||
(RGB data in NHWC channel order, typically uint8) to a format ready to be
|
||||
consumed by DensePose training (BGR float32 data in NCHW channel order)
|
||||
"""
|
||||
|
||||
def __init__(self, min_size: int = 800, max_size: int = 1333):
|
||||
self.min_size = min_size
|
||||
self.max_size = max_size
|
||||
|
||||
def __call__(self, frames: torch.Tensor) -> torch.Tensor:
|
||||
"""
|
||||
Args:
|
||||
frames (torch.Tensor): tensor of size [N, H, W, 3] that contains
|
||||
RGB data (typically in uint8)
|
||||
Returns:
|
||||
frames (torch.Tensor): tensor of size [N, 3, H1, W1] where
|
||||
H1 and W1 are chosen to respect the specified min and max sizes
|
||||
and preserve the original aspect ratio, the data channels
|
||||
follow BGR order and the data type is `torch.float32`
|
||||
"""
|
||||
frames = frames[..., [2, 1, 0]] # RGB -> BGR
|
||||
frames = frames.permute(0, 3, 1, 2).float() # NHWC -> NCHW
|
||||
# resize with min size
|
||||
min_size = min(frames.shape[-2:])
|
||||
max_size = max(frames.shape[-2:])
|
||||
scale = min(self.min_size / min_size, self.max_size / max_size)
|
||||
frames = torch.nn.functional.interpolate(
|
||||
frames, scale_factor=scale, mode="bilinear", align_corners=False
|
||||
)
|
||||
return frames
|
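# Minimal numeric sketch (illustrative, not part of the library): the scale
# rule above resizes the shorter side to `min_size` unless that would push
# the longer side past `max_size`.
#
#     t = ImageResizeTransform()            # min_size=800, max_size=1333
#     frames = torch.zeros(2, 480, 640, 3)  # N, H, W, C (RGB)
#     out = t(frames)                       # float32, NCHW, BGR
#     # scale = min(800 / 480, 1333 / 640) = 5/3; out.shape == (2, 3, 800, 1066)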
|
@ -0,0 +1,22 @@
|
|||
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
|
||||
|
||||
import os
|
||||
from typing import Optional
|
||||
|
||||
|
||||
def is_relative_local_path(path: os.PathLike):
|
||||
path_str = os.fsdecode(path)
|
||||
return ("://" not in path_str) and not os.path.isabs(path)
|
||||
|
||||
|
||||
def maybe_prepend_base_path(base_path: Optional[os.PathLike], path: os.PathLike):
|
||||
"""
|
||||
Prepends the provided path with a base path prefix if:
|
||||
1) base path is not None;
|
||||
2) path is a local path
|
||||
"""
|
||||
if base_path is None:
|
||||
return path
|
||||
if is_relative_local_path(path):
|
||||
return os.path.join(base_path, path)
|
||||
return path
|
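# Minimal usage sketch (illustrative, not part of the library; paths are
# hypothetical):
#
#     maybe_prepend_base_path("/data/videos", "clip01.mp4")      # "/data/videos/clip01.mp4"
#     maybe_prepend_base_path("/data/videos", "/abs/clip.mp4")   # unchanged (absolute path)
#     maybe_prepend_base_path("/data/videos", "s3://bkt/v.mp4")  # unchanged (URI)
#     maybe_prepend_base_path(None, "clip01.mp4")                # unchanged (no base path)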
|
@ -0,0 +1,17 @@
|
|||
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
|
||||
|
||||
from .frame_selector import (
|
||||
FrameSelectionStrategy,
|
||||
RandomKFramesSelector,
|
||||
FirstKFramesSelector,
|
||||
LastKFramesSelector,
|
||||
FrameTsList,
|
||||
FrameSelector,
|
||||
)
|
||||
|
||||
from .video_keyframe_dataset import (
|
||||
VideoKeyframeDataset,
|
||||
video_list_from_file,
|
||||
list_keyframes,
|
||||
read_keyframes,
|
||||
)
|
|
@ -0,0 +1,87 @@
|
|||
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
|
||||
|
||||
import random
|
||||
from collections.abc import Callable
|
||||
from enum import Enum
|
||||
from typing import Callable as TCallable
|
||||
from typing import List
|
||||
|
||||
FrameTsList = List[int]
|
||||
FrameSelector = TCallable[[FrameTsList], FrameTsList]
|
||||
|
||||
|
||||
class FrameSelectionStrategy(Enum):
|
||||
"""
|
||||
Frame selection strategy used with videos:
|
||||
- "random_k": select k random frames
|
||||
- "first_k": select k first frames
|
||||
- "last_k": select k last frames
|
||||
- "all": select all frames
|
||||
"""
|
||||
|
||||
# fmt: off
|
||||
RANDOM_K = "random_k"
|
||||
FIRST_K = "first_k"
|
||||
LAST_K = "last_k"
|
||||
ALL = "all"
|
||||
# fmt: on
|
||||
|
||||
|
||||
class RandomKFramesSelector(Callable):
|
||||
"""
|
||||
Selector that retains at most `k` random frames
|
||||
"""
|
||||
|
||||
def __init__(self, k: int):
|
||||
self.k = k
|
||||
|
||||
def __call__(self, frame_tss: FrameTsList) -> FrameTsList:
|
||||
"""
|
||||
Select `k` random frames
|
||||
|
||||
Args:
|
||||
frame_tss (List[int]): timestamps of input frames
|
||||
Returns:
|
||||
List[int]: timestamps of selected frames
|
||||
"""
|
||||
return random.sample(frame_tss, min(self.k, len(frame_tss)))
|
||||
|
||||
|
||||
class FirstKFramesSelector(Callable):
|
||||
"""
|
||||
Selector that retains at most `k` first frames
|
||||
"""
|
||||
|
||||
def __init__(self, k: int):
|
||||
self.k = k
|
||||
|
||||
def __call__(self, frame_tss: FrameTsList) -> FrameTsList:
|
||||
"""
|
||||
Select `k` first frames
|
||||
|
||||
Args:
|
||||
frame_tss (List[int]): timestamps of input frames
|
||||
Returns:
|
||||
List[int]: timestamps of selected frames
|
||||
"""
|
||||
return frame_tss[: self.k]
|
||||
|
||||
|
||||
class LastKFramesSelector(Callable):
|
||||
"""
|
||||
Selector that retains at most `k` last frames from video data
|
||||
"""
|
||||
|
||||
def __init__(self, k: int):
|
||||
self.k = k
|
||||
|
||||
def __call__(self, frame_tss: FrameTsList) -> FrameTsList:
|
||||
"""
|
||||
Select `k` last frames
|
||||
|
||||
Args:
|
||||
frame_tss (List[int]): timestamps of input frames
|
||||
Returns:
|
||||
List[int]: timestamps of selected frames
|
||||
"""
|
||||
return frame_tss[-self.k :]
|
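# Minimal usage sketch (illustrative, not part of the library): the selectors
# are interchangeable callables over keyframe timestamp lists.
#
#     tss = [0, 30, 60, 90, 120]
#     FirstKFramesSelector(2)(tss)        # [0, 30]
#     LastKFramesSelector(2)(tss)         # [90, 120]
#     len(RandomKFramesSelector(2)(tss))  # 2, drawn without replacement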
|
@ -0,0 +1,232 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
|
||||
|
||||
import logging
|
||||
import numpy as np
|
||||
from typing import Callable, List, Optional
|
||||
import torch
|
||||
from fvcore.common.file_io import PathManager
|
||||
from torch.utils.data.dataset import Dataset
|
||||
|
||||
import av
|
||||
|
||||
from ..utils import maybe_prepend_base_path
|
||||
from .frame_selector import FrameSelector, FrameTsList
|
||||
|
||||
FrameList = List[av.frame.Frame]
|
||||
FrameTransform = Callable[[torch.Tensor], torch.Tensor]
|
||||
|
||||
|
||||
def list_keyframes(video_fpath: str, video_stream_idx: int = 0) -> FrameTsList:
|
||||
"""
|
||||
Traverses all keyframes of a video file. Returns a list of keyframe
|
||||
timestamps. Timestamps are counts in timebase units.
|
||||
|
||||
Args:
|
||||
video_fpath (str): Video file path
|
||||
video_stream_idx (int): Video stream index (default: 0)
|
||||
Returns:
|
||||
List[int]: list of keyframe timestamps (timestamp is a count in timebase
|
||||
units)
|
||||
"""
|
||||
try:
|
||||
with PathManager.open(video_fpath, "rb") as io:
|
||||
container = av.open(io, mode="r")
|
||||
stream = container.streams.video[video_stream_idx]
|
||||
keyframes = []
|
||||
pts = -1
|
||||
# Note: even though we request forward seeks for keyframes, sometimes
|
||||
# a keyframe in backwards direction is returned. We introduce tolerance
|
||||
# as a max count of ignored backward seeks
|
||||
tolerance_backward_seeks = 2
|
||||
while True:
|
||||
try:
|
||||
container.seek(pts + 1, backward=False, any_frame=False, stream=stream)
|
||||
except av.AVError as e:
|
||||
# the exception occurs when the video length is exceeded,
|
||||
# we then return whatever data we've already collected
|
||||
logger = logging.getLogger(__name__)
|
||||
logger.debug(
|
||||
f"List keyframes: Error seeking video file {video_fpath}, "
|
||||
f"video stream {video_stream_idx}, pts {pts + 1}, AV error: {e}"
|
||||
)
|
||||
return keyframes
|
||||
except OSError as e:
|
||||
logger = logging.getLogger(__name__)
|
||||
logger.warning(
|
||||
f"List keyframes: Error seeking video file {video_fpath}, "
|
||||
f"video stream {video_stream_idx}, pts {pts + 1}, OS error: {e}"
|
||||
)
|
||||
return []
|
||||
packet = next(container.demux(video=video_stream_idx))
|
||||
if packet.pts is not None and packet.pts <= pts:
|
||||
logger = logging.getLogger(__name__)
|
||||
logger.warning(
|
||||
f"Video file {video_fpath}, stream {video_stream_idx}: "
|
||||
f"bad seek for packet {pts + 1} (got packet {packet.pts}), "
|
||||
f"tolerance {tolerance_backward_seeks}."
|
||||
)
|
||||
tolerance_backward_seeks -= 1
|
||||
if tolerance_backward_seeks == 0:
|
||||
return []
|
||||
pts += 1
|
||||
continue
|
||||
tolerance_backward_seeks = 2
|
||||
pts = packet.pts
|
||||
if pts is None:
|
||||
return keyframes
|
||||
if packet.is_keyframe:
|
||||
keyframes.append(pts)
|
||||
return keyframes
|
||||
except OSError as e:
|
||||
logger = logging.getLogger(__name__)
|
||||
logger.warning(
|
||||
f"List keyframes: Error opening video file container {video_fpath}, " f"OS error: {e}"
|
||||
)
|
||||
except RuntimeError as e:
|
||||
logger = logging.getLogger(__name__)
|
||||
logger.warning(
|
||||
f"List keyframes: Error opening video file container {video_fpath}, "
|
||||
f"Runtime error: {e}"
|
||||
)
|
||||
return []
|
||||
|
||||
|
||||
def read_keyframes(
|
||||
video_fpath: str, keyframes: FrameTsList, video_stream_idx: int = 0
|
||||
) -> FrameList:
|
||||
"""
|
||||
Reads keyframe data from a video file.
|
||||
|
||||
Args:
|
||||
video_fpath (str): Video file path
|
||||
keyframes (List[int]): List of keyframe timestamps (as counts in
|
||||
timebase units to be used in container seek operations)
|
||||
video_stream_idx (int): Video stream index (default: 0)
|
||||
Returns:
|
||||
List[Frame]: list of frames that correspond to the specified timestamps
|
||||
"""
|
||||
try:
|
||||
with PathManager.open(video_fpath, "rb") as io:
|
||||
container = av.open(io)
|
||||
stream = container.streams.video[video_stream_idx]
|
||||
frames = []
|
||||
for pts in keyframes:
|
||||
try:
|
||||
container.seek(pts, any_frame=False, stream=stream)
|
||||
frame = next(container.decode(video=0))
|
||||
frames.append(frame)
|
||||
except av.AVError as e:
|
||||
logger = logging.getLogger(__name__)
|
||||
logger.warning(
|
||||
f"Read keyframes: Error seeking video file {video_fpath}, "
|
||||
f"video stream {video_stream_idx}, pts {pts}, AV error: {e}"
|
||||
)
|
||||
container.close()
|
||||
return frames
|
||||
except OSError as e:
|
||||
logger = logging.getLogger(__name__)
|
||||
logger.warning(
|
||||
f"Read keyframes: Error seeking video file {video_fpath}, "
|
||||
f"video stream {video_stream_idx}, pts {pts}, OS error: {e}"
|
||||
)
|
||||
container.close()
|
||||
return frames
|
||||
except StopIteration:
|
||||
logger = logging.getLogger(__name__)
|
||||
logger.warning(
|
||||
f"Read keyframes: Error decoding frame from {video_fpath}, "
|
||||
f"video stream {video_stream_idx}, pts {pts}"
|
||||
)
|
||||
container.close()
|
||||
return frames
|
||||
|
||||
container.close()
|
||||
return frames
|
||||
except OSError as e:
|
||||
logger = logging.getLogger(__name__)
|
||||
logger.warning(
|
||||
f"Read keyframes: Error opening video file container {video_fpath}, OS error: {e}"
|
||||
)
|
||||
except RuntimeError as e:
|
||||
logger = logging.getLogger(__name__)
|
||||
logger.warning(
|
||||
f"Read keyframes: Error opening video file container {video_fpath}, Runtime error: {e}"
|
||||
)
|
||||
return []
|
||||
|
||||
|
||||
def video_list_from_file(video_list_fpath: str, base_path: Optional[str] = None):
|
||||
"""
|
||||
Create a list of paths to video files from a text file.
|
||||
|
||||
Args:
|
||||
video_list_fpath (str): path to a plain text file with the list of videos
|
||||
base_path (str): base path for entries from the video list (default: None)
|
||||
"""
|
||||
video_list = []
|
||||
with PathManager.open(video_list_fpath, "r") as io:
|
||||
for line in io:
|
||||
video_list.append(maybe_prepend_base_path(base_path, line.strip()))
|
||||
return video_list
|
||||
|
||||
|
||||
class VideoKeyframeDataset(Dataset):
|
||||
"""
|
||||
Dataset that provides keyframes for a set of videos.
|
||||
"""
|
||||
|
||||
_EMPTY_FRAMES = torch.empty((0, 3, 1, 1))
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
video_list: List[str],
|
||||
frame_selector: Optional[FrameSelector] = None,
|
||||
transform: Optional[FrameTransform] = None,
|
||||
):
|
||||
"""
|
||||
Dataset constructor
|
||||
|
||||
Args:
|
||||
video_list (List[str]): list of paths to video files
|
||||
frame_selector (Callable: FrameTsList -> FrameTsList):
|
||||
selects keyframes to process, keyframes are given by
|
||||
packet timestamps in timebase counts. If None, all keyframes
|
||||
are selected (default: None)
|
||||
transform (Callable: torch.Tensor -> torch.Tensor):
|
||||
transforms a batch of RGB images (tensors of size [B, H, W, 3]),
|
||||
returns a tensor of the same size. If None, no transform is
|
||||
applied (default: None)
|
||||
|
||||
"""
|
||||
self.video_list = video_list
|
||||
self.frame_selector = frame_selector
|
||||
self.transform = transform
|
||||
|
||||
def __getitem__(self, idx: int) -> torch.Tensor:
|
||||
"""
|
||||
Gets selected keyframes from a given video
|
||||
|
||||
Args:
|
||||
idx (int): video index in the video list file
|
||||
Returns:
|
||||
frames (torch.Tensor): tensor of size [N, H, W, 3] or of size
|
||||
defined by the transform that contains keyframes data
|
||||
"""
|
||||
fpath = self.video_list[idx]
|
||||
keyframes = list_keyframes(fpath)
|
||||
if not keyframes:
|
||||
return self._EMPTY_FRAMES
|
||||
if self.frame_selector is not None:
|
||||
keyframes = self.frame_selector(keyframes)
|
||||
frames = read_keyframes(fpath, keyframes)
|
||||
if not frames:
|
||||
return self._EMPTY_FRAMES
|
||||
frames = np.stack([frame.to_rgb().to_ndarray() for frame in frames])
|
||||
frames = torch.as_tensor(frames, device=torch.device("cpu"))
|
||||
if self.transform is not None:
|
||||
frames = self.transform(frames)
|
||||
return frames
|
||||
|
||||
def __len__(self):
|
||||
return len(self.video_list)
|
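# Minimal usage sketch (illustrative, not part of the library; the list file
# path is hypothetical):
#
#     video_list = video_list_from_file("videos.txt", base_path="/data")
#     dataset = VideoKeyframeDataset(
#         video_list,
#         frame_selector=RandomKFramesSelector(4),
#         transform=ImageResizeTransform(),
#     )
#     frames = dataset[0]  # (k, 3, H1, W1) float32 BGR, or empty on failure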
|
@ -0,0 +1,3 @@
|
|||
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
|
||||
|
||||
from .trainer import Trainer
|
|
@@ -0,0 +1,118 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved

import logging
import os
from collections import OrderedDict

from detectron2.checkpoint import DetectionCheckpointer
from detectron2.config import CfgNode
from detectron2.engine import DefaultTrainer
from detectron2.evaluation import COCOEvaluator, DatasetEvaluators
from detectron2.utils.events import EventWriter, get_event_storage

from densepose import (
    DensePoseCOCOEvaluator,
    DensePoseDatasetMapperTTA,
    DensePoseGeneralizedRCNNWithTTA,
    load_from_cfg,
)
from densepose.data import (
    DatasetMapper,
    build_combined_loader,
    build_detection_test_loader,
    build_detection_train_loader,
    build_inference_based_loaders,
    has_inference_based_loaders,
)


class SampleCountingLoader:
    def __init__(self, loader):
        self.loader = loader

    def __iter__(self):
        it = iter(self.loader)
        storage = get_event_storage()
        while True:
            try:
                batch = next(it)
                num_inst_per_dataset = {}
                for data in batch:
                    dataset_name = data["dataset"]
                    if dataset_name not in num_inst_per_dataset:
                        num_inst_per_dataset[dataset_name] = 0
                    num_inst = len(data["instances"])
                    num_inst_per_dataset[dataset_name] += num_inst
                for dataset_name in num_inst_per_dataset:
                    storage.put_scalar(f"batch/{dataset_name}", num_inst_per_dataset[dataset_name])
                yield batch
            except StopIteration:
                break


class SampleCountMetricPrinter(EventWriter):
    def __init__(self):
        self.logger = logging.getLogger(__name__)

    def write(self):
        storage = get_event_storage()
        batch_stats_strs = []
        for key, buf in storage.histories().items():
            if key.startswith("batch/"):
                batch_stats_strs.append(f"{key} {buf.avg(20)}")
        self.logger.info(", ".join(batch_stats_strs))


class Trainer(DefaultTrainer):
    @classmethod
    def build_evaluator(cls, cfg: CfgNode, dataset_name, output_folder=None):
        if output_folder is None:
            output_folder = os.path.join(cfg.OUTPUT_DIR, "inference")
        evaluators = [COCOEvaluator(dataset_name, cfg, True, output_folder)]
        if cfg.MODEL.DENSEPOSE_ON:
            evaluators.append(DensePoseCOCOEvaluator(dataset_name, True, output_folder))
        return DatasetEvaluators(evaluators)

    @classmethod
    def build_test_loader(cls, cfg: CfgNode, dataset_name):
        return build_detection_test_loader(cfg, dataset_name, mapper=DatasetMapper(cfg, False))

    @classmethod
    def build_train_loader(cls, cfg: CfgNode):
        data_loader = build_detection_train_loader(cfg, mapper=DatasetMapper(cfg, True))
        if not has_inference_based_loaders(cfg):
            return data_loader
        model = cls.build_model(cfg)
        model.to(cfg.BOOTSTRAP_MODEL.DEVICE)
        DetectionCheckpointer(model).resume_or_load(cfg.BOOTSTRAP_MODEL.WEIGHTS, resume=False)
        inference_based_loaders, ratios = build_inference_based_loaders(cfg, model)
        loaders = [data_loader] + inference_based_loaders
        ratios = [1.0] + ratios
        combined_data_loader = build_combined_loader(cfg, loaders, ratios)
        sample_counting_loader = SampleCountingLoader(combined_data_loader)
        return sample_counting_loader

    def build_writers(self):
        writers = super().build_writers()
        writers.append(SampleCountMetricPrinter())
        return writers

    @classmethod
    def test_with_TTA(cls, cfg: CfgNode, model):
        logger = logging.getLogger("detectron2.trainer")
        # At the end of training, run an evaluation with TTA.
        # Only supports some R-CNN models.
        logger.info("Running inference with test-time augmentation ...")
        transform_data = load_from_cfg(cfg)
        model = DensePoseGeneralizedRCNNWithTTA(
            cfg, model, transform_data, DensePoseDatasetMapperTTA(cfg)
        )
        evaluators = [
            cls.build_evaluator(
                cfg, name, output_folder=os.path.join(cfg.OUTPUT_DIR, "inference_TTA")
            )
            for name in cfg.DATASETS.TEST
        ]
        res = cls.test(cfg, model, evaluators)
        res = OrderedDict({k + "_TTA": v for k, v in res.items()})
        return res
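A minimal sketch of how this `Trainer` is typically driven, following standard detectron2 train-script conventions; the config file path is a placeholder, not a file shipped in this diff.

```
# Launch sketch following detectron2 conventions; the YAML path is a
# placeholder, and Trainer refers to the class defined above.
from detectron2.config import get_cfg
from densepose import add_densepose_config

cfg = get_cfg()
add_densepose_config(cfg)
cfg.merge_from_file("configs/some_densepose_config.yaml")  # placeholder path
cfg.freeze()

trainer = Trainer(cfg)
trainer.resume_or_load(resume=False)
trainer.train()
```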
@@ -0,0 +1,224 @@
# -*- coding: utf-8 -*-
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved

import contextlib
import copy
import io
import itertools
import logging
import numpy as np
import os
from collections import OrderedDict
import pycocotools.mask as mask_utils
import torch
from fvcore.common.file_io import PathManager
from pycocotools.coco import COCO

from detectron2.data import MetadataCatalog
from detectron2.evaluation import DatasetEvaluator
from detectron2.structures import BoxMode
from detectron2.utils.comm import all_gather, is_main_process, synchronize
from detectron2.utils.logger import create_small_table

from .data.samplers import densepose_to_mask
from .densepose_coco_evaluation import DensePoseCocoEval, DensePoseEvalMode


class DensePoseCOCOEvaluator(DatasetEvaluator):
    def __init__(self, dataset_name, distributed, output_dir=None):
        self._distributed = distributed
        self._output_dir = output_dir

        self._cpu_device = torch.device("cpu")
        self._logger = logging.getLogger(__name__)

        self._metadata = MetadataCatalog.get(dataset_name)
        self._min_threshold = 0.5
        json_file = PathManager.get_local_path(self._metadata.json_file)
        with contextlib.redirect_stdout(io.StringIO()):
            self._coco_api = COCO(json_file)

    def reset(self):
        self._predictions = []

    def process(self, inputs, outputs):
        """
        Args:
            inputs: the inputs to a COCO model (e.g., GeneralizedRCNN).
                It is a list of dicts. Each dict corresponds to an image and
                contains keys like "height", "width", "file_name", "image_id".
            outputs: the outputs of a COCO model. It is a list of dicts with key
                "instances" that contains :class:`Instances`.
                The :class:`Instances` object needs to have a `densepose` field.
        """
        for input, output in zip(inputs, outputs):
            instances = output["instances"].to(self._cpu_device)

            json_results = prediction_to_json(instances, input["image_id"])
            self._predictions.extend(json_results)

    def evaluate(self):
        if self._distributed:
            synchronize()
            predictions = all_gather(self._predictions)
            predictions = list(itertools.chain(*predictions))
            if not is_main_process():
                return
        else:
            predictions = self._predictions

        return copy.deepcopy(self._eval_predictions(predictions))

    def _eval_predictions(self, predictions):
        """
        Evaluate predictions on densepose.
        Return results with the metrics of the tasks.
        """
        self._logger.info("Preparing results for COCO format ...")

        if self._output_dir:
            PathManager.mkdirs(self._output_dir)
            file_path = os.path.join(self._output_dir, "coco_densepose_predictions.pth")
            with PathManager.open(file_path, "wb") as f:
                torch.save(predictions, f)

        self._logger.info("Evaluating predictions ...")
        res = OrderedDict()
        results_gps, results_gpsm, results_segm = _evaluate_predictions_on_coco(
            self._coco_api, predictions, min_threshold=self._min_threshold
        )
        res["densepose_gps"] = results_gps
        res["densepose_gpsm"] = results_gpsm
        res["densepose_segm"] = results_segm
        return res


def prediction_to_json(instances, img_id):
    """
    Args:
        instances (Instances): the output of the model
        img_id (str): the image id in COCO

    Returns:
        list[dict]: the results in densepose evaluation format
    """
    scores = instances.scores.tolist()
    segmentations = densepose_to_mask(instances)

    boxes = instances.pred_boxes.tensor.clone()
    boxes = BoxMode.convert(boxes, BoxMode.XYXY_ABS, BoxMode.XYWH_ABS)
    instances.pred_densepose = instances.pred_densepose.to_result(boxes)

    results = []
    for k in range(len(instances)):
        densepose = instances.pred_densepose[k]
        segmentation = segmentations.tensor[k]
        segmentation_encoded = mask_utils.encode(
            np.require(segmentation.numpy(), dtype=np.uint8, requirements=["F"])
        )
        segmentation_encoded["counts"] = segmentation_encoded["counts"].decode("utf-8")
        result = {
            "image_id": img_id,
            "category_id": 1,  # densepose only has one class
            "bbox": densepose[1],
            "score": scores[k],
            "densepose": densepose,
            "segmentation": segmentation_encoded,
        }
        results.append(result)
    return results


def _evaluate_predictions_on_coco(coco_gt, coco_results, min_threshold=0.5):
    logger = logging.getLogger(__name__)

    segm_metrics = _get_segmentation_metrics()
    densepose_metrics = _get_densepose_metrics(min_threshold)
    if len(coco_results) == 0:  # cocoapi does not handle empty results very well
        logger.warning("No predictions from the model! Set scores to -1")
        results_gps = {metric: -1 for metric in densepose_metrics}
        results_gpsm = {metric: -1 for metric in densepose_metrics}
        results_segm = {metric: -1 for metric in segm_metrics}
        return results_gps, results_gpsm, results_segm

    coco_dt = coco_gt.loadRes(coco_results)
    results_segm = _evaluate_predictions_on_coco_segm(coco_gt, coco_dt, segm_metrics, min_threshold)
    logger.info("Evaluation results for densepose segm: \n" + create_small_table(results_segm))
    results_gps = _evaluate_predictions_on_coco_gps(
        coco_gt, coco_dt, densepose_metrics, min_threshold
    )
    logger.info(
        "Evaluation results for densepose, GPS metric: \n" + create_small_table(results_gps)
    )
    results_gpsm = _evaluate_predictions_on_coco_gpsm(
        coco_gt, coco_dt, densepose_metrics, min_threshold
    )
    logger.info(
        "Evaluation results for densepose, GPSm metric: \n" + create_small_table(results_gpsm)
    )
    return results_gps, results_gpsm, results_segm


def _get_densepose_metrics(min_threshold=0.5):
    metrics = ["AP"]
    if min_threshold <= 0.201:
        metrics += ["AP20"]
    if min_threshold <= 0.301:
        metrics += ["AP30"]
    if min_threshold <= 0.401:
        metrics += ["AP40"]
    metrics.extend(["AP50", "AP75", "APm", "APl", "AR", "AR50", "AR75", "ARm", "ARl"])
    return metrics


def _get_segmentation_metrics():
    return [
        "AP",
        "AP50",
        "AP75",
        "APs",
        "APm",
        "APl",
        "AR@1",
        "AR@10",
        "AR@100",
        "ARs",
        "ARm",
        "ARl",
    ]


def _evaluate_predictions_on_coco_gps(coco_gt, coco_dt, metrics, min_threshold=0.5):
    coco_eval = DensePoseCocoEval(coco_gt, coco_dt, "densepose", dpEvalMode=DensePoseEvalMode.GPS)
    coco_eval.params.iouThrs = np.linspace(
        min_threshold, 0.95, int(np.round((0.95 - min_threshold) / 0.05)) + 1, endpoint=True
    )
    coco_eval.evaluate()
    coco_eval.accumulate()
    coco_eval.summarize()
    results = {metric: float(coco_eval.stats[idx] * 100) for idx, metric in enumerate(metrics)}
    return results


def _evaluate_predictions_on_coco_gpsm(coco_gt, coco_dt, metrics, min_threshold=0.5):
    coco_eval = DensePoseCocoEval(coco_gt, coco_dt, "densepose", dpEvalMode=DensePoseEvalMode.GPSM)
    coco_eval.params.iouThrs = np.linspace(
        min_threshold, 0.95, int(np.round((0.95 - min_threshold) / 0.05)) + 1, endpoint=True
    )
    coco_eval.evaluate()
    coco_eval.accumulate()
    coco_eval.summarize()
    results = {metric: float(coco_eval.stats[idx] * 100) for idx, metric in enumerate(metrics)}
    return results


def _evaluate_predictions_on_coco_segm(coco_gt, coco_dt, metrics, min_threshold=0.5):
    coco_eval = DensePoseCocoEval(coco_gt, coco_dt, "segm")
    coco_eval.params.iouThrs = np.linspace(
        min_threshold, 0.95, int(np.round((0.95 - min_threshold) / 0.05)) + 1, endpoint=True
    )
    coco_eval.evaluate()
    coco_eval.accumulate()
    coco_eval.summarize()
    results = {metric: float(coco_eval.stats[idx] * 100) for idx, metric in enumerate(metrics)}
    return results
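The `np.linspace` expression used by all three `_evaluate_predictions_on_coco_*` helpers builds a COCO-style IoU threshold grid from `min_threshold` to 0.95 in 0.05 steps; a self-contained check of that arithmetic:

```
import numpy as np

def iou_thresholds(min_threshold=0.5):
    # Same expression as in the helpers above: thresholds from min_threshold
    # to 0.95 in 0.05 steps, both endpoints included.
    return np.linspace(
        min_threshold, 0.95, int(np.round((0.95 - min_threshold) / 0.05)) + 1, endpoint=True
    )

print(iou_thresholds(0.5))  # [0.5 0.55 ... 0.95], 10 thresholds
print(iou_thresholds(0.2))  # 16 thresholds; AP20/AP30/AP40 then become meaningful
```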
@@ -0,0 +1,66 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved

from detectron2.config import CfgNode

from .filter import DensePoseDataFilter
from .losses import DensePoseLosses
from .predictors import DensePoseChartWithConfidencePredictor


def build_densepose_predictor(cfg: CfgNode, input_channels: int):
    """
    Create an instance of DensePose predictor based on configuration options.

    Args:
        cfg (CfgNode): configuration options
        input_channels (int): input tensor size along the channel dimension
    Return:
        An instance of DensePose predictor
    """
    predictor = DensePoseChartWithConfidencePredictor(cfg, input_channels)
    return predictor


def build_densepose_data_filter(cfg: CfgNode):
    """
    Build DensePose data filter which selects data for training

    Args:
        cfg (CfgNode): configuration options

    Return:
        Callable: list(Tensor), list(Instances) -> list(Tensor), list(Instances)
        An instance of DensePose filter, which takes feature tensors and proposals
        as an input and returns filtered features and proposals
    """
    dp_filter = DensePoseDataFilter(cfg)
    return dp_filter


def build_densepose_head(cfg: CfgNode, input_channels: int):
    """
    Build DensePose head based on configuration options

    Args:
        cfg (CfgNode): configuration options
        input_channels (int): input tensor size along the channel dimension
    Return:
        An instance of DensePose head
    """
    from .roi_heads.registry import ROI_DENSEPOSE_HEAD_REGISTRY

    head_name = cfg.MODEL.ROI_DENSEPOSE_HEAD.NAME
    return ROI_DENSEPOSE_HEAD_REGISTRY.get(head_name)(cfg, input_channels)


def build_densepose_losses(cfg: CfgNode):
    """
    Build DensePose loss based on configuration options

    Args:
        cfg (CfgNode): configuration options
    Return:
        An instance of DensePose loss
    """
    losses = DensePoseLosses(cfg)
    return losses
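A sketch of how these factories fit together when called from inside this module; the channel counts are illustrative, and `add_densepose_config` is assumed to register the `MODEL.ROI_DENSEPOSE_HEAD` keys the factories read.

```
# Sketch only: run inside this package; channel counts are illustrative.
from detectron2.config import get_cfg
from densepose import add_densepose_config

cfg = get_cfg()
add_densepose_config(cfg)  # assumed to add MODEL.ROI_DENSEPOSE_HEAD defaults

dp_filter = build_densepose_data_filter(cfg)
head = build_densepose_head(cfg, input_channels=256)
predictor = build_densepose_predictor(cfg, input_channels=512)  # illustrative
losses = build_densepose_losses(cfg)
```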
@@ -0,0 +1,73 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved

from dataclasses import dataclass
from enum import Enum

from detectron2.config import CfgNode


class DensePoseUVConfidenceType(Enum):
    """
    Statistical model type for confidence learning, possible values:
     - "iid_iso": statistically independent identically distributed residuals
         with isotropic covariance
     - "indep_aniso": statistically independent residuals with anisotropic
         covariances
    For details, see:
    N. Neverova, D. Novotny, A. Vedaldi "Correlated Uncertainty for Learning
    Dense Correspondences from Noisy Labels", p. 918--926, in Proc. NIPS 2019
    """

    # fmt: off
    IID_ISO     = "iid_iso"
    INDEP_ANISO = "indep_aniso"
    # fmt: on


@dataclass
class DensePoseUVConfidenceConfig:
    """
    Configuration options for confidence on UV data
    """

    enabled: bool = False
    # lower bound on UV confidences
    epsilon: float = 0.01
    type: DensePoseUVConfidenceType = DensePoseUVConfidenceType.IID_ISO


@dataclass
class DensePoseSegmConfidenceConfig:
    """
    Configuration options for confidence on segmentation
    """

    enabled: bool = False
    # lower bound on confidence values
    epsilon: float = 0.01


@dataclass
class DensePoseConfidenceModelConfig:
    """
    Configuration options for confidence models
    """

    # confidence for U and V values
    uv_confidence: DensePoseUVConfidenceConfig
    # segmentation confidence
    segm_confidence: DensePoseSegmConfidenceConfig

    @staticmethod
    def from_cfg(cfg: CfgNode) -> "DensePoseConfidenceModelConfig":
        return DensePoseConfidenceModelConfig(
            uv_confidence=DensePoseUVConfidenceConfig(
                enabled=cfg.MODEL.ROI_DENSEPOSE_HEAD.UV_CONFIDENCE.ENABLED,
                epsilon=cfg.MODEL.ROI_DENSEPOSE_HEAD.UV_CONFIDENCE.EPSILON,
                type=DensePoseUVConfidenceType(cfg.MODEL.ROI_DENSEPOSE_HEAD.UV_CONFIDENCE.TYPE),
            ),
            segm_confidence=DensePoseSegmConfidenceConfig(
                enabled=cfg.MODEL.ROI_DENSEPOSE_HEAD.SEGM_CONFIDENCE.ENABLED,
                epsilon=cfg.MODEL.ROI_DENSEPOSE_HEAD.SEGM_CONFIDENCE.EPSILON,
            ),
        )
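A sketch of resolving these dataclasses from a detectron2 config; it assumes `add_densepose_config` registers the `*_CONFIDENCE` keys that `from_cfg` reads, and that the classes above are in scope.

```
# Sketch: build the confidence config from a detectron2 CfgNode.
from detectron2.config import get_cfg
from densepose import add_densepose_config

cfg = get_cfg()
add_densepose_config(cfg)  # assumed to add the UV_/SEGM_CONFIDENCE keys
cfg.MODEL.ROI_DENSEPOSE_HEAD.UV_CONFIDENCE.ENABLED = True
cfg.MODEL.ROI_DENSEPOSE_HEAD.UV_CONFIDENCE.TYPE = "indep_aniso"

conf = DensePoseConfidenceModelConfig.from_cfg(cfg)
assert conf.uv_confidence.type is DensePoseUVConfidenceType.INDEP_ANISO
```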
@@ -0,0 +1,35 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
from collections import OrderedDict

from detectron2.checkpoint import DetectionCheckpointer


def _rename_HRNet_weights(weights):
    # We detect and rename HRNet weights for DensePose. 1956 and 1716 are values that are
    # common to all HRNet pretrained weights, and should be enough to accurately identify them
    if (
        len(weights["model"].keys()) == 1956
        and len([k for k in weights["model"].keys() if k.startswith("stage")]) == 1716
    ):
        hrnet_weights = OrderedDict()
        for k in weights["model"].keys():
            hrnet_weights["backbone.bottom_up." + str(k)] = weights["model"][k]
        return {"model": hrnet_weights}
    else:
        return weights


class DensePoseCheckpointer(DetectionCheckpointer):
    """
    Same as :class:`DetectionCheckpointer`, but is able to handle HRNet weights
    """

    def __init__(self, model, save_dir="", *, save_to_disk=None, **checkpointables):
        super().__init__(model, save_dir, save_to_disk=save_to_disk, **checkpointables)

    def _load_file(self, filename: str) -> object:
        """
        Adds HRNet support to the default weight loading
        """
        weights = super()._load_file(filename)
        return _rename_HRNet_weights(weights)
@@ -0,0 +1,94 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved

from typing import List
import torch

from detectron2.config import CfgNode
from detectron2.structures import Instances
from detectron2.structures.boxes import matched_boxlist_iou


class DensePoseDataFilter(object):
    def __init__(self, cfg: CfgNode):
        self.iou_threshold = cfg.MODEL.ROI_DENSEPOSE_HEAD.FG_IOU_THRESHOLD
        self.keep_masks = cfg.MODEL.ROI_DENSEPOSE_HEAD.COARSE_SEGM_TRAINED_BY_MASKS

    @torch.no_grad()
    def __call__(self, features: List[torch.Tensor], proposals_with_targets: List[Instances]):
        """
        Filters proposals with targets to keep only the ones relevant for
        DensePose training

        Args:
            features (list[Tensor]): input data as a list of features,
                each feature is a tensor. Axis 0 represents the number of
                images `N` in the input data; axes 1-3 are channels,
                height, and width, which may vary between features
                (e.g., if a feature pyramid is used).
            proposals_with_targets (list[Instances]): length `N` list of
                `Instances`. The i-th `Instances` contains instances
                (proposals, GT) for the i-th input image.
        Returns:
            list[Tensor]: filtered features
            list[Instances]: filtered proposals
        """
        proposals_filtered = []
        # TODO: the commented out code was supposed to correctly deal with situations
        # where no valid DensePose GT is available for certain images. The corresponding
        # image features were sliced and proposals were filtered. This led to performance
        # deterioration, both in terms of runtime and in terms of evaluation results.
        #
        # feature_mask = torch.ones(
        #    len(proposals_with_targets),
        #    dtype=torch.bool,
        #    device=features[0].device if len(features) > 0 else torch.device("cpu"),
        # )
        for i, proposals_per_image in enumerate(proposals_with_targets):
            if not proposals_per_image.has("gt_densepose") and (
                not proposals_per_image.has("gt_masks") or not self.keep_masks
            ):
                # feature_mask[i] = 0
                continue
            gt_boxes = proposals_per_image.gt_boxes
            est_boxes = proposals_per_image.proposal_boxes
            # apply match threshold for densepose head
            iou = matched_boxlist_iou(gt_boxes, est_boxes)
            iou_select = iou > self.iou_threshold
            proposals_per_image = proposals_per_image[iou_select]

            N_gt_boxes = len(proposals_per_image.gt_boxes)
            assert N_gt_boxes == len(proposals_per_image.proposal_boxes), (
                f"The number of GT boxes {N_gt_boxes} is different from the "
                f"number of proposal boxes {len(proposals_per_image.proposal_boxes)}"
            )
            # filter out any target without suitable annotation
            if self.keep_masks:
                gt_masks = (
                    proposals_per_image.gt_masks
                    if hasattr(proposals_per_image, "gt_masks")
                    else [None] * N_gt_boxes
                )
            else:
                gt_masks = [None] * N_gt_boxes
            gt_densepose = (
                proposals_per_image.gt_densepose
                if hasattr(proposals_per_image, "gt_densepose")
                else [None] * N_gt_boxes
            )
            assert len(gt_masks) == N_gt_boxes
            assert len(gt_densepose) == N_gt_boxes
            selected_indices = [
                i
                for i, (dp_target, mask_target) in enumerate(zip(gt_densepose, gt_masks))
                if (dp_target is not None) or (mask_target is not None)
            ]
            # if not len(selected_indices):
            #     feature_mask[i] = 0
            #     continue
            if len(selected_indices) != N_gt_boxes:
                proposals_per_image = proposals_per_image[selected_indices]
            assert len(proposals_per_image.gt_boxes) == len(proposals_per_image.proposal_boxes)
            proposals_filtered.append(proposals_per_image)
        # features_filtered = [feature[feature_mask] for feature in features]
        # return features_filtered, proposals_filtered
        return features, proposals_filtered
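The heart of the filter is `matched_boxlist_iou`, which scores each proposal against its matched GT box element-wise; a self-contained illustration of the thresholding step (the 0.7 threshold stands in for `FG_IOU_THRESHOLD`):

```
import torch
from detectron2.structures import Boxes
from detectron2.structures.boxes import matched_boxlist_iou

# Two matched (GT, proposal) pairs in XYXY format.
gt = Boxes(torch.tensor([[0.0, 0.0, 10.0, 10.0], [0.0, 0.0, 10.0, 10.0]]))
est = Boxes(torch.tensor([[0.0, 0.0, 10.0, 10.0], [5.0, 5.0, 15.0, 15.0]]))

iou = matched_boxlist_iou(gt, est)  # element-wise IoU of matched pairs
keep = iou > 0.7                    # stand-in for FG_IOU_THRESHOLD
print(iou)   # tensor([1.0000, 0.1429])
print(keep)  # tensor([ True, False])
```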
@@ -0,0 +1,181 @@
"""
MIT License
Copyright (c) 2019 Microsoft
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
"""

import torch
import torch.nn as nn
import torch.nn.functional as F

from detectron2.layers import ShapeSpec
from detectron2.modeling.backbone import BACKBONE_REGISTRY
from detectron2.modeling.backbone.backbone import Backbone

from .hrnet import build_pose_hrnet_backbone


class HRFPN(Backbone):
    """HRFPN (High Resolution Feature Pyramids)
    Transforms outputs of HRNet backbone so they are suitable for the ROI_heads
    arXiv: https://arxiv.org/abs/1904.04514
    Adapted from https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/necks/hrfpn.py
    Args:
        bottom_up: (list) output of HRNet
        in_features (list): names of the input features (output of HRNet)
        in_channels (list): number of channels for each branch
        out_channels (int): output channels of feature pyramids
        n_out_features (int): number of output stages
        pooling (str): pooling for generating feature pyramids (from {MAX, AVG})
        share_conv (bool): Have one conv per output, or share one with all the outputs
    """

    def __init__(
        self,
        bottom_up,
        in_features,
        n_out_features,
        in_channels,
        out_channels,
        pooling="AVG",
        share_conv=False,
    ):
        super(HRFPN, self).__init__()
        assert isinstance(in_channels, list)
        self.bottom_up = bottom_up
        self.in_features = in_features
        self.n_out_features = n_out_features
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.num_ins = len(in_channels)
        self.share_conv = share_conv

        if self.share_conv:
            self.fpn_conv = nn.Conv2d(
                in_channels=out_channels, out_channels=out_channels, kernel_size=3, padding=1
            )
        else:
            self.fpn_conv = nn.ModuleList()
            for _ in range(self.n_out_features):
                self.fpn_conv.append(
                    nn.Conv2d(
                        in_channels=out_channels,
                        out_channels=out_channels,
                        kernel_size=3,
                        padding=1,
                    )
                )

        # Custom change: Replaces a simple bilinear interpolation
        self.interp_conv = nn.ModuleList()
        for i in range(len(self.in_features)):
            self.interp_conv.append(
                nn.Sequential(
                    nn.ConvTranspose2d(
                        in_channels=in_channels[i],
                        out_channels=in_channels[i],
                        kernel_size=4,
                        stride=2 ** i,
                        padding=0,
                        output_padding=0,
                        bias=False,
                    ),
                    nn.BatchNorm2d(in_channels[i], momentum=0.1),
                    nn.ReLU(inplace=True),
                )
            )

        # Custom change: Replaces a couple (reduction conv + pooling) by one conv
        self.reduction_pooling_conv = nn.ModuleList()
        for i in range(self.n_out_features):
            self.reduction_pooling_conv.append(
                nn.Sequential(
                    nn.Conv2d(sum(in_channels), out_channels, kernel_size=2 ** i, stride=2 ** i),
                    nn.BatchNorm2d(out_channels, momentum=0.1),
                    nn.ReLU(inplace=True),
                )
            )

        if pooling == "MAX":
            self.pooling = F.max_pool2d
        else:
            self.pooling = F.avg_pool2d

        self._out_features = []
        self._out_feature_channels = {}
        self._out_feature_strides = {}

        for i in range(self.n_out_features):
            self._out_features.append("p%d" % (i + 1))
            self._out_feature_channels.update({self._out_features[-1]: self.out_channels})
            self._out_feature_strides.update({self._out_features[-1]: 2 ** (i + 2)})

    # default init_weights for conv(msra) and norm in ConvModule
    def init_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, a=1)
                nn.init.constant_(m.bias, 0)

    def forward(self, inputs):
        bottom_up_features = self.bottom_up(inputs)
        assert len(bottom_up_features) == len(self.in_features)
        inputs = [bottom_up_features[f] for f in self.in_features]

        outs = []
        for i in range(len(inputs)):
            outs.append(self.interp_conv[i](inputs[i]))
        shape_2 = min(o.shape[2] for o in outs)
        shape_3 = min(o.shape[3] for o in outs)
        out = torch.cat([o[:, :, :shape_2, :shape_3] for o in outs], dim=1)
        outs = []
        for i in range(self.n_out_features):
            outs.append(self.reduction_pooling_conv[i](out))
        for i in range(len(outs)):  # Make shapes consistent
            outs[-1 - i] = outs[-1 - i][
                :, :, : outs[-1].shape[2] * 2 ** i, : outs[-1].shape[3] * 2 ** i
            ]
        outputs = []
        for i in range(len(outs)):
            if self.share_conv:
                outputs.append(self.fpn_conv(outs[i]))
            else:
                outputs.append(self.fpn_conv[i](outs[i]))

        assert len(self._out_features) == len(outputs)
        return dict(zip(self._out_features, outputs))


@BACKBONE_REGISTRY.register()
def build_hrfpn_backbone(cfg, input_shape: ShapeSpec):

    in_channels = cfg.MODEL.HRNET.STAGE4.NUM_CHANNELS
    in_features = ["p%d" % (i + 1) for i in range(cfg.MODEL.HRNET.STAGE4.NUM_BRANCHES)]
    n_out_features = len(cfg.MODEL.ROI_HEADS.IN_FEATURES)
    out_channels = cfg.MODEL.HRNET.HRFPN.OUT_CHANNELS
    hrnet = build_pose_hrnet_backbone(cfg, input_shape)
    hrfpn = HRFPN(
        hrnet,
        in_features,
        n_out_features,
        in_channels,
        out_channels,
        pooling="AVG",
        share_conv=False,
    )

    return hrfpn
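The `reduction_pooling_conv` trick above produces pyramid level i by convolving the concatenated high-resolution map with a kernel of size and stride `2 ** i`; a toy, self-contained version of that stride arithmetic:

```
import torch
import torch.nn as nn

# Toy version of the reduction step: a concatenated high-resolution map is
# turned into pyramid levels by kernel=stride=2**i convolutions, mirroring
# HRFPN.reduction_pooling_conv above. Channel counts are illustrative.
concat_channels, out_channels = 16, 8
x = torch.randn(1, concat_channels, 64, 64)  # stride-4 map of a 256x256 image

for i in range(4):
    conv = nn.Conv2d(concat_channels, out_channels, kernel_size=2 ** i, stride=2 ** i)
    print(f"p{i + 1}", tuple(conv(x).shape))
# p1 (1, 8, 64, 64)  p2 (1, 8, 32, 32)  p3 (1, 8, 16, 16)  p4 (1, 8, 8, 8)
```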
@@ -0,0 +1,473 @@
# ------------------------------------------------------------------------------
# Copyright (c) Microsoft
# Licensed under the MIT License.
# Written by Bin Xiao (leoxiaobin@gmail.com)
# Modified by Bowen Cheng (bcheng9@illinois.edu)
# Adapted from https://github.com/HRNet/Higher-HRNet-Human-Pose-Estimation/blob/master/lib/models/pose_higher_hrnet.py # noqa
# ------------------------------------------------------------------------------

from __future__ import absolute_import, division, print_function
import logging
import torch.nn as nn

from detectron2.layers import ShapeSpec
from detectron2.modeling.backbone import BACKBONE_REGISTRY
from detectron2.modeling.backbone.backbone import Backbone

BN_MOMENTUM = 0.1
logger = logging.getLogger(__name__)

__all__ = ["build_pose_hrnet_backbone", "PoseHigherResolutionNet"]


def conv3x3(in_planes, out_planes, stride=1):
    """3x3 convolution with padding"""
    return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, padding=1, bias=False)


class BasicBlock(nn.Module):
    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(BasicBlock, self).__init__()
        self.conv1 = conv3x3(inplanes, planes, stride)
        self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(planes, planes)
        self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)

        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)

        return out


class Bottleneck(nn.Module):
    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(Bottleneck, self).__init__()
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
        self.conv3 = nn.Conv2d(planes, planes * self.expansion, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(planes * self.expansion, momentum=BN_MOMENTUM)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)

        out = self.conv3(out)
        out = self.bn3(out)

        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)

        return out


class HighResolutionModule(nn.Module):
    """HighResolutionModule
    Building block of the PoseHigherResolutionNet (see below)
    arXiv: https://arxiv.org/abs/1908.10357
    Args:
        num_branches (int): number of branches of the module
        blocks (str): type of block of the module
        num_blocks (int): number of blocks of the module
        num_inchannels (int): number of input channels of the module
        num_channels (list): number of channels of each branch
        multi_scale_output (bool): only used by the last module of PoseHigherResolutionNet
    """

    def __init__(
        self,
        num_branches,
        blocks,
        num_blocks,
        num_inchannels,
        num_channels,
        multi_scale_output=True,
    ):
        super(HighResolutionModule, self).__init__()
        self._check_branches(num_branches, blocks, num_blocks, num_inchannels, num_channels)

        self.num_inchannels = num_inchannels
        self.num_branches = num_branches

        self.multi_scale_output = multi_scale_output

        self.branches = self._make_branches(num_branches, blocks, num_blocks, num_channels)
        self.fuse_layers = self._make_fuse_layers()
        self.relu = nn.ReLU(True)

    def _check_branches(self, num_branches, blocks, num_blocks, num_inchannels, num_channels):
        if num_branches != len(num_blocks):
            error_msg = "NUM_BRANCHES({}) <> NUM_BLOCKS({})".format(num_branches, len(num_blocks))
            logger.error(error_msg)
            raise ValueError(error_msg)

        if num_branches != len(num_channels):
            error_msg = "NUM_BRANCHES({}) <> NUM_CHANNELS({})".format(
                num_branches, len(num_channels)
            )
            logger.error(error_msg)
            raise ValueError(error_msg)

        if num_branches != len(num_inchannels):
            error_msg = "NUM_BRANCHES({}) <> NUM_INCHANNELS({})".format(
                num_branches, len(num_inchannels)
            )
            logger.error(error_msg)
            raise ValueError(error_msg)

    def _make_one_branch(self, branch_index, block, num_blocks, num_channels, stride=1):
        downsample = None
        if (
            stride != 1
            or self.num_inchannels[branch_index] != num_channels[branch_index] * block.expansion
        ):
            downsample = nn.Sequential(
                nn.Conv2d(
                    self.num_inchannels[branch_index],
                    num_channels[branch_index] * block.expansion,
                    kernel_size=1,
                    stride=stride,
                    bias=False,
                ),
                nn.BatchNorm2d(num_channels[branch_index] * block.expansion, momentum=BN_MOMENTUM),
            )

        layers = []
        layers.append(
            block(self.num_inchannels[branch_index], num_channels[branch_index], stride, downsample)
        )
        self.num_inchannels[branch_index] = num_channels[branch_index] * block.expansion
        for _ in range(1, num_blocks[branch_index]):
            layers.append(block(self.num_inchannels[branch_index], num_channels[branch_index]))

        return nn.Sequential(*layers)

    def _make_branches(self, num_branches, block, num_blocks, num_channels):
        branches = []

        for i in range(num_branches):
            branches.append(self._make_one_branch(i, block, num_blocks, num_channels))

        return nn.ModuleList(branches)

    def _make_fuse_layers(self):
        if self.num_branches == 1:
            return None

        num_branches = self.num_branches
        num_inchannels = self.num_inchannels
        fuse_layers = []
        for i in range(num_branches if self.multi_scale_output else 1):
            fuse_layer = []
            for j in range(num_branches):
                if j > i:
                    fuse_layer.append(
                        nn.Sequential(
                            nn.Conv2d(num_inchannels[j], num_inchannels[i], 1, 1, 0, bias=False),
                            nn.BatchNorm2d(num_inchannels[i]),
                            nn.Upsample(scale_factor=2 ** (j - i), mode="nearest"),
                        )
                    )
                elif j == i:
                    fuse_layer.append(None)
                else:
                    conv3x3s = []
                    for k in range(i - j):
                        if k == i - j - 1:
                            num_outchannels_conv3x3 = num_inchannels[i]
                            conv3x3s.append(
                                nn.Sequential(
                                    nn.Conv2d(
                                        num_inchannels[j],
                                        num_outchannels_conv3x3,
                                        3,
                                        2,
                                        1,
                                        bias=False,
                                    ),
                                    nn.BatchNorm2d(num_outchannels_conv3x3),
                                )
                            )
                        else:
                            num_outchannels_conv3x3 = num_inchannels[j]
                            conv3x3s.append(
                                nn.Sequential(
                                    nn.Conv2d(
                                        num_inchannels[j],
                                        num_outchannels_conv3x3,
                                        3,
                                        2,
                                        1,
                                        bias=False,
                                    ),
                                    nn.BatchNorm2d(num_outchannels_conv3x3),
                                    nn.ReLU(True),
                                )
                            )
                    fuse_layer.append(nn.Sequential(*conv3x3s))
            fuse_layers.append(nn.ModuleList(fuse_layer))

        return nn.ModuleList(fuse_layers)

    def get_num_inchannels(self):
        return self.num_inchannels

    def forward(self, x):
        if self.num_branches == 1:
            return [self.branches[0](x[0])]

        for i in range(self.num_branches):
            x[i] = self.branches[i](x[i])

        x_fuse = []

        for i in range(len(self.fuse_layers)):
            y = x[0] if i == 0 else self.fuse_layers[i][0](x[0])
            for j in range(1, self.num_branches):
                if i == j:
                    y = y + x[j]
                else:
                    z = self.fuse_layers[i][j](x[j])[:, :, : y.shape[2], : y.shape[3]]
                    y = y + z
            x_fuse.append(self.relu(y))

        return x_fuse


blocks_dict = {"BASIC": BasicBlock, "BOTTLENECK": Bottleneck}


class PoseHigherResolutionNet(Backbone):
    """PoseHigherResolutionNet
    Composed of several HighResolutionModule tied together with ConvNets
    Adapted from the GitHub version to fit with HRFPN and the Detectron2 infrastructure
    arXiv: https://arxiv.org/abs/1908.10357
    """

    def __init__(self, cfg, **kwargs):
        self.inplanes = cfg.MODEL.HRNET.STEM_INPLANES
        super(PoseHigherResolutionNet, self).__init__()

        # stem net
        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=2, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(64, momentum=BN_MOMENTUM)
        self.conv2 = nn.Conv2d(64, 64, kernel_size=3, stride=2, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(64, momentum=BN_MOMENTUM)
        self.relu = nn.ReLU(inplace=True)
        self.layer1 = self._make_layer(Bottleneck, 64, 4)

        self.stage2_cfg = cfg.MODEL.HRNET.STAGE2
        num_channels = self.stage2_cfg.NUM_CHANNELS
        block = blocks_dict[self.stage2_cfg.BLOCK]
        num_channels = [num_channels[i] * block.expansion for i in range(len(num_channels))]
        self.transition1 = self._make_transition_layer([256], num_channels)
        self.stage2, pre_stage_channels = self._make_stage(self.stage2_cfg, num_channels)

        self.stage3_cfg = cfg.MODEL.HRNET.STAGE3
        num_channels = self.stage3_cfg.NUM_CHANNELS
        block = blocks_dict[self.stage3_cfg.BLOCK]
        num_channels = [num_channels[i] * block.expansion for i in range(len(num_channels))]
        self.transition2 = self._make_transition_layer(pre_stage_channels, num_channels)
        self.stage3, pre_stage_channels = self._make_stage(self.stage3_cfg, num_channels)

        self.stage4_cfg = cfg.MODEL.HRNET.STAGE4
        num_channels = self.stage4_cfg.NUM_CHANNELS
        block = blocks_dict[self.stage4_cfg.BLOCK]
        num_channels = [num_channels[i] * block.expansion for i in range(len(num_channels))]
        self.transition3 = self._make_transition_layer(pre_stage_channels, num_channels)
        self.stage4, pre_stage_channels = self._make_stage(
            self.stage4_cfg, num_channels, multi_scale_output=True
        )

        self._out_features = []
        self._out_feature_channels = {}
        self._out_feature_strides = {}

        for i in range(cfg.MODEL.HRNET.STAGE4.NUM_BRANCHES):
            self._out_features.append("p%d" % (i + 1))
            self._out_feature_channels.update(
                {self._out_features[-1]: cfg.MODEL.HRNET.STAGE4.NUM_CHANNELS[i]}
            )
            self._out_feature_strides.update({self._out_features[-1]: 1})

    def _get_deconv_cfg(self, deconv_kernel):
        if deconv_kernel == 4:
            padding = 1
            output_padding = 0
        elif deconv_kernel == 3:
            padding = 1
            output_padding = 1
        elif deconv_kernel == 2:
            padding = 0
            output_padding = 0
        else:
            # guard against silently unbound values for unsupported kernels
            raise ValueError("Unsupported deconv kernel size: {}".format(deconv_kernel))

        return deconv_kernel, padding, output_padding

    def _make_transition_layer(self, num_channels_pre_layer, num_channels_cur_layer):
        num_branches_cur = len(num_channels_cur_layer)
        num_branches_pre = len(num_channels_pre_layer)

        transition_layers = []
        for i in range(num_branches_cur):
            if i < num_branches_pre:
                if num_channels_cur_layer[i] != num_channels_pre_layer[i]:
                    transition_layers.append(
                        nn.Sequential(
                            nn.Conv2d(
                                num_channels_pre_layer[i],
                                num_channels_cur_layer[i],
                                3,
                                1,
                                1,
                                bias=False,
                            ),
                            nn.BatchNorm2d(num_channels_cur_layer[i]),
                            nn.ReLU(inplace=True),
                        )
                    )
                else:
                    transition_layers.append(None)
            else:
                conv3x3s = []
                for j in range(i + 1 - num_branches_pre):
                    inchannels = num_channels_pre_layer[-1]
                    outchannels = (
                        num_channels_cur_layer[i] if j == i - num_branches_pre else inchannels
                    )
                    conv3x3s.append(
                        nn.Sequential(
                            nn.Conv2d(inchannels, outchannels, 3, 2, 1, bias=False),
                            nn.BatchNorm2d(outchannels),
                            nn.ReLU(inplace=True),
                        )
                    )
                transition_layers.append(nn.Sequential(*conv3x3s))

        return nn.ModuleList(transition_layers)

    def _make_layer(self, block, planes, blocks, stride=1):
        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                nn.Conv2d(
                    self.inplanes,
                    planes * block.expansion,
                    kernel_size=1,
                    stride=stride,
                    bias=False,
                ),
                nn.BatchNorm2d(planes * block.expansion, momentum=BN_MOMENTUM),
            )

        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample))
        self.inplanes = planes * block.expansion
        for _ in range(1, blocks):
            layers.append(block(self.inplanes, planes))

        return nn.Sequential(*layers)

    def _make_stage(self, layer_config, num_inchannels, multi_scale_output=True):
        num_modules = layer_config["NUM_MODULES"]
        num_branches = layer_config["NUM_BRANCHES"]
        num_blocks = layer_config["NUM_BLOCKS"]
        num_channels = layer_config["NUM_CHANNELS"]
        block = blocks_dict[layer_config["BLOCK"]]

        modules = []
        for i in range(num_modules):
            # multi_scale_output is only used by the last module
            if not multi_scale_output and i == num_modules - 1:
                reset_multi_scale_output = False
            else:
                reset_multi_scale_output = True

            modules.append(
                HighResolutionModule(
                    num_branches,
                    block,
                    num_blocks,
                    num_inchannels,
                    num_channels,
                    reset_multi_scale_output,
                )
            )
            num_inchannels = modules[-1].get_num_inchannels()

        return nn.Sequential(*modules), num_inchannels

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.conv2(x)
        x = self.bn2(x)
        x = self.relu(x)
        x = self.layer1(x)

        x_list = []
        for i in range(self.stage2_cfg.NUM_BRANCHES):
            if self.transition1[i] is not None:
                x_list.append(self.transition1[i](x))
            else:
                x_list.append(x)
        y_list = self.stage2(x_list)

        x_list = []
        for i in range(self.stage3_cfg.NUM_BRANCHES):
            if self.transition2[i] is not None:
                x_list.append(self.transition2[i](y_list[-1]))
            else:
                x_list.append(y_list[i])
        y_list = self.stage3(x_list)

        x_list = []
        for i in range(self.stage4_cfg.NUM_BRANCHES):
            if self.transition3[i] is not None:
                x_list.append(self.transition3[i](y_list[-1]))
            else:
                x_list.append(y_list[i])
        y_list = self.stage4(x_list)

        assert len(self._out_features) == len(y_list)
        return dict(zip(self._out_features, y_list))  # final_outputs


@BACKBONE_REGISTRY.register()
def build_pose_hrnet_backbone(cfg, input_shape: ShapeSpec):
    model = PoseHigherResolutionNet(cfg)
    return model
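Since the builder is registered with `BACKBONE_REGISTRY`, the backbone can be instantiated through detectron2's generic `build_backbone`; a sketch, assuming `add_hrnet_config` (imported alongside `add_densepose_config` elsewhere in this repo) registers the `MODEL.HRNET.*` keys read above.

```
# Sketch: instantiate the registered backbone via detectron2's registry.
import torch
from detectron2.config import get_cfg
from detectron2.modeling import build_backbone
from densepose import add_densepose_config, add_hrnet_config

cfg = get_cfg()
add_densepose_config(cfg)
add_hrnet_config(cfg)  # assumed to register MODEL.HRNET.* defaults
cfg.MODEL.BACKBONE.NAME = "build_pose_hrnet_backbone"

model = build_backbone(cfg)
features = model(torch.randn(1, 3, 256, 256))
print({k: tuple(v.shape) for k, v in features.items()})  # p1..p4 feature maps
```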
@@ -0,0 +1,83 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
from typing import List, Tuple
import torch

from detectron2.structures import Instances

from ..data.structures import DensePoseOutput


def densepose_inference(
    densepose_outputs: Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor],
    densepose_confidences: Tuple[
        torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor
    ],
    detections: List[Instances],
):
    """
    Infer dense pose estimate based on outputs from the DensePose head
    and detections. The estimate for each detection instance is stored in its
    "pred_densepose" attribute.

    Args:
        densepose_outputs (tuple(`torch.Tensor`)): iterable containing 4 elements:
            - s (:obj: `torch.Tensor`): coarse segmentation tensor of size (N, A, H, W),
            - i (:obj: `torch.Tensor`): fine segmentation tensor of size (N, C, H, W),
            - u (:obj: `torch.Tensor`): U coordinates for each class of size (N, C, H, W),
            - v (:obj: `torch.Tensor`): V coordinates for each class of size (N, C, H, W),
            where N is the total number of detections in a batch,
            A is the number of coarse segmentation labels
            (e.g. 15 for coarse body parts + background),
            C is the number of fine segmentation labels
            (e.g. 25 for fine body parts + background),
            W is the resolution along the X axis,
            H is the resolution along the Y axis.
        densepose_confidences (tuple(`torch.Tensor`)): iterable containing 6 elements:
            - sigma_1 (:obj: `torch.Tensor`): global confidences for UV coordinates
                of size (N, C, H, W)
            - sigma_2 (:obj: `torch.Tensor`): individual confidences for UV coordinates
                of size (N, C, H, W)
            - kappa_u (:obj: `torch.Tensor`): first component of confidence direction
                vector of size (N, C, H, W)
            - kappa_v (:obj: `torch.Tensor`): second component of confidence direction
                vector of size (N, C, H, W)
            - fine_segm_confidence (:obj: `torch.Tensor`): confidence for fine
                segmentation of size (N, 1, H, W)
            - coarse_segm_confidence (:obj: `torch.Tensor`): confidence for coarse
                segmentation of size (N, 1, H, W)
        detections (list[Instances]): A list of N Instances, where N is the number of images
            in the batch. Instances are modified by this method: a "pred_densepose" attribute
            is added to each instance, the attribute contains the corresponding
            DensePoseOutput object.
    """
    # DensePose outputs: segmentation, body part indices, U, V
    s, index_uv, u, v = densepose_outputs
    (
        sigma_1,
        sigma_2,
        kappa_u,
        kappa_v,
        fine_segm_confidence,
        coarse_segm_confidence,
    ) = densepose_confidences
    k = 0
    for detection in detections:
        n_i = len(detection)
        s_i = s[k : k + n_i]
        index_uv_i = index_uv[k : k + n_i]
        u_i = u[k : k + n_i]
        v_i = v[k : k + n_i]
        _local_vars = locals()
        confidences = {
            name: _local_vars[name][k : k + n_i]
            for name in (
                "sigma_1",
                "sigma_2",
                "kappa_u",
                "kappa_v",
                "fine_segm_confidence",
                "coarse_segm_confidence",
            )
            if _local_vars.get(name) is not None
        }
        densepose_output_i = DensePoseOutput(s_i, index_uv_i, u_i, v_i, confidences)
        detection.pred_densepose = densepose_output_i
        k += n_i
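`densepose_inference` walks the batched head outputs with a running offset `k`, giving each image's `Instances` its own `[k : k + n_i]` slice; the pattern in isolation:

```
import torch

# Batched per-instance outputs (N total detections across a batch) are split
# back per image by a running offset, as in densepose_inference above.
outputs = torch.arange(6).reshape(6, 1)  # stand-in for an (N, ...) head output
detections_per_image = [3, 1, 2]         # len(detection) for each image

k = 0
for n_i in detections_per_image:
    print(outputs[k : k + n_i].flatten().tolist())
    k += n_i
# [0, 1, 2] / [3] / [4, 5]
```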
@@ -0,0 +1,3 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved

from .densepose_losses import DensePoseLosses
@@ -0,0 +1,729 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import math
from dataclasses import dataclass
from typing import Iterable, Optional
import torch
from torch import nn
from torch.nn import functional as F

from detectron2.structures import Instances

from .. import DensePoseConfidenceModelConfig, DensePoseUVConfidenceType


def _linear_interpolation_utilities(v_norm, v0_src, size_src, v0_dst, size_dst, size_z):
    """
    Computes utility values for linear interpolation at points v.
    The points are given as normalized offsets in the source interval
    (v0_src, v0_src + size_src), more precisely:
        v = v0_src + v_norm * size_src / 256.0
    The computed utilities include lower points v_lo, upper points v_hi,
    interpolation weights v_w and flags j_valid indicating whether the
    points fall into the destination interval (v0_dst, v0_dst + size_dst).

    Args:
        v_norm (:obj: `torch.Tensor`): tensor of size N containing
            normalized point offsets
        v0_src (:obj: `torch.Tensor`): tensor of size N containing
            left bounds of source intervals for normalized points
        size_src (:obj: `torch.Tensor`): tensor of size N containing
            source interval sizes for normalized points
        v0_dst (:obj: `torch.Tensor`): tensor of size N containing
            left bounds of destination intervals
        size_dst (:obj: `torch.Tensor`): tensor of size N containing
            destination interval sizes
        size_z (int): interval size for data to be interpolated

    Returns:
        v_lo (:obj: `torch.Tensor`): int tensor of size N containing
            indices of lower values used for interpolation, all values are
            integers from [0, size_z - 1]
        v_hi (:obj: `torch.Tensor`): int tensor of size N containing
            indices of upper values used for interpolation, all values are
            integers from [0, size_z - 1]
        v_w (:obj: `torch.Tensor`): float tensor of size N containing
            interpolation weights
        j_valid (:obj: `torch.Tensor`): uint8 tensor of size N containing
            0 for points outside the estimation interval
            (v0_est, v0_est + size_est) and 1 otherwise
    """
    v = v0_src + v_norm * size_src / 256.0
    j_valid = (v - v0_dst >= 0) * (v - v0_dst < size_dst)
    v_grid = (v - v0_dst) * size_z / size_dst
    v_lo = v_grid.floor().long().clamp(min=0, max=size_z - 1)
    v_hi = (v_lo + 1).clamp(max=size_z - 1)
    v_grid = torch.min(v_hi.float(), v_grid)
    v_w = v_grid - v_lo.float()
    return v_lo, v_hi, v_w, j_valid
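A worked example of the function above: a point annotated at normalized offset 128 (halfway through the 256-unit annotation scale) inside a 10-pixel GT interval, mapped onto a 5-cell estimation grid over the same interval, lands exactly between cells 2 and 3.

```
import torch

# Worked example for _linear_interpolation_utilities (defined above).
v_lo, v_hi, v_w, j_valid = _linear_interpolation_utilities(
    v_norm=torch.tensor([128.0]),   # halfway through the 256-unit scale
    v0_src=torch.tensor([0.0]),     # GT interval start
    size_src=torch.tensor([10.0]),  # GT interval size
    v0_dst=torch.tensor([0.0]),     # estimated interval start
    size_dst=torch.tensor([10.0]),  # estimated interval size
    size_z=5,                       # output grid resolution
)
print(v_lo.item(), v_hi.item(), v_w.item(), bool(j_valid.item()))
# 2 3 0.5 True: the point sits exactly between grid cells 2 and 3.
```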
|
||||
|
||||
class SingleTensorsHelper:
|
||||
def __init__(self, proposals_with_gt):
|
||||
|
||||
with torch.no_grad():
|
||||
(
|
||||
index_uv_img,
|
||||
i_with_dp,
|
||||
bbox_xywh_est,
|
||||
bbox_xywh_gt,
|
||||
index_gt_all,
|
||||
x_norm,
|
||||
y_norm,
|
||||
u_gt_all,
|
||||
v_gt_all,
|
||||
s_gt,
|
||||
index_bbox,
|
||||
) = _extract_single_tensors_from_matches(proposals_with_gt)
|
||||
|
||||
for k, v in locals().items():
|
||||
if k not in ["self", "proposals_with_gt"]:
|
||||
setattr(self, k, v)
|
||||
|
||||
|
||||
class BilinearInterpolationHelper:
|
||||
"""
|
||||
Args:
|
||||
tensors_helper (SingleTensorsHelper)
|
||||
j_valid (:obj: `torch.Tensor`): uint8 tensor of size M containing
|
||||
0 for points to be discarded and 1 for points to be selected
|
||||
y_lo (:obj: `torch.Tensor`): int tensor of indices of upper values
|
||||
in z_est for each point
|
||||
y_hi (:obj: `torch.Tensor`): int tensor of indices of lower values
|
||||
in z_est for each point
|
||||
x_lo (:obj: `torch.Tensor`): int tensor of indices of left values
|
||||
in z_est for each point
|
||||
x_hi (:obj: `torch.Tensor`): int tensor of indices of right values
|
||||
in z_est for each point
|
||||
w_ylo_xlo (:obj: `torch.Tensor`): float tensor of size M;
|
||||
contains upper-left value weight for each point
|
||||
w_ylo_xhi (:obj: `torch.Tensor`): float tensor of size M;
|
||||
contains upper-right value weight for each point
|
||||
w_yhi_xlo (:obj: `torch.Tensor`): float tensor of size M;
|
||||
contains lower-left value weight for each point
|
||||
w_yhi_xhi (:obj: `torch.Tensor`): float tensor of size M;
|
||||
contains lower-right value weight for each point
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
tensors_helper,
|
||||
j_valid,
|
||||
y_lo,
|
||||
y_hi,
|
||||
x_lo,
|
||||
x_hi,
|
||||
w_ylo_xlo,
|
||||
w_ylo_xhi,
|
||||
w_yhi_xlo,
|
||||
w_yhi_xhi,
|
||||
):
|
||||
for k, v in locals().items():
|
||||
if k != "self":
|
||||
setattr(self, k, v)
|
||||
|
||||
@staticmethod
|
||||
def from_matches(tensors_helper, densepose_outputs_size):
|
||||
|
||||
zh, zw = densepose_outputs_size[2], densepose_outputs_size[3]
|
||||
|
||||
x0_gt, y0_gt, w_gt, h_gt = tensors_helper.bbox_xywh_gt[tensors_helper.index_bbox].unbind(1)
|
||||
x0_est, y0_est, w_est, h_est = tensors_helper.bbox_xywh_est[
|
||||
tensors_helper.index_bbox
|
||||
].unbind(dim=1)
|
||||
x_lo, x_hi, x_w, jx_valid = _linear_interpolation_utilities(
|
||||
tensors_helper.x_norm, x0_gt, w_gt, x0_est, w_est, zw
|
||||
)
|
||||
y_lo, y_hi, y_w, jy_valid = _linear_interpolation_utilities(
|
||||
tensors_helper.y_norm, y0_gt, h_gt, y0_est, h_est, zh
|
||||
)
|
||||
j_valid = jx_valid * jy_valid
|
||||
|
||||
w_ylo_xlo = (1.0 - x_w) * (1.0 - y_w)
|
||||
w_ylo_xhi = x_w * (1.0 - y_w)
|
||||
w_yhi_xlo = (1.0 - x_w) * y_w
|
||||
w_yhi_xhi = x_w * y_w
|
||||
|
||||
return BilinearInterpolationHelper(
|
||||
tensors_helper,
|
||||
j_valid,
|
||||
y_lo,
|
||||
y_hi,
|
||||
x_lo,
|
||||
x_hi,
|
||||
w_ylo_xlo,
|
||||
w_ylo_xhi,
|
||||
w_yhi_xlo,
|
||||
w_yhi_xhi,
|
||||
)
|
||||
|
||||
def extract_at_points(
|
||||
self,
|
||||
z_est,
|
||||
slice_index_uv=None,
|
||||
w_ylo_xlo=None,
|
||||
w_ylo_xhi=None,
|
||||
w_yhi_xlo=None,
|
||||
w_yhi_xhi=None,
|
||||
):
|
||||
"""
|
||||
Extract ground truth values z_gt for valid point indices and estimated
|
||||
values z_est using bilinear interpolation over top-left (y_lo, x_lo),
|
||||
top-right (y_lo, x_hi), bottom-left (y_hi, x_lo) and bottom-right
|
||||
(y_hi, x_hi) values in z_est with corresponding weights:
|
||||
w_ylo_xlo, w_ylo_xhi, w_yhi_xlo and w_yhi_xhi.
|
||||
Use slice_index_uv to slice dim=1 in z_est
|
||||
"""
|
||||
index_gt_all = self.tensors_helper.index_gt_all
|
||||
slice_index_uv = index_gt_all if slice_index_uv is None else slice_index_uv
|
||||
w_ylo_xlo = self.w_ylo_xlo if w_ylo_xlo is None else w_ylo_xlo
|
||||
w_ylo_xhi = self.w_ylo_xhi if w_ylo_xhi is None else w_ylo_xhi
|
||||
w_yhi_xlo = self.w_yhi_xlo if w_yhi_xlo is None else w_yhi_xlo
|
||||
w_yhi_xhi = self.w_yhi_xhi if w_yhi_xhi is None else w_yhi_xhi
|
||||
|
||||
index_bbox = self.tensors_helper.index_bbox
|
||||
z_est_sampled = (
|
||||
z_est[index_bbox, slice_index_uv, self.y_lo, self.x_lo] * w_ylo_xlo
|
||||
+ z_est[index_bbox, slice_index_uv, self.y_lo, self.x_hi] * w_ylo_xhi
|
||||
+ z_est[index_bbox, slice_index_uv, self.y_hi, self.x_lo] * w_yhi_xlo
|
||||
+ z_est[index_bbox, slice_index_uv, self.y_hi, self.x_hi] * w_yhi_xhi
|
||||
)
|
||||
return z_est_sampled


def _resample_data(
    z, bbox_xywh_src, bbox_xywh_dst, wout, hout, mode="nearest", padding_mode="zeros"
):
    """
    Args:
        z (:obj: `torch.Tensor`): tensor of size (N,C,H,W) with data to be
            resampled
        bbox_xywh_src (:obj: `torch.Tensor`): tensor of size (N,4) containing
            source bounding boxes in format XYWH
        bbox_xywh_dst (:obj: `torch.Tensor`): tensor of size (N,4) containing
            destination bounding boxes in format XYWH
        wout (int): width of the output grid
        hout (int): height of the output grid
    Return:
        zresampled (:obj: `torch.Tensor`): tensor of size (N, C, hout, wout)
            with resampled values of z
    """
    n = bbox_xywh_src.size(0)
    assert n == bbox_xywh_dst.size(0), (
        "The number of "
        "source ROIs for resampling ({}) should be equal to the number "
        "of destination ROIs ({})".format(bbox_xywh_src.size(0), bbox_xywh_dst.size(0))
    )
    x0src, y0src, wsrc, hsrc = bbox_xywh_src.unbind(dim=1)
    x0dst, y0dst, wdst, hdst = bbox_xywh_dst.unbind(dim=1)
    x0dst_norm = 2 * (x0dst - x0src) / wsrc - 1
    y0dst_norm = 2 * (y0dst - y0src) / hsrc - 1
    x1dst_norm = 2 * (x0dst + wdst - x0src) / wsrc - 1
    y1dst_norm = 2 * (y0dst + hdst - y0src) / hsrc - 1
    grid_w = torch.arange(wout, device=z.device, dtype=torch.float) / wout
    grid_h = torch.arange(hout, device=z.device, dtype=torch.float) / hout
    grid_w_expanded = grid_w[None, None, :].expand(n, hout, wout)
    grid_h_expanded = grid_h[None, :, None].expand(n, hout, wout)
    dx_expanded = (x1dst_norm - x0dst_norm)[:, None, None].expand(n, hout, wout)
    dy_expanded = (y1dst_norm - y0dst_norm)[:, None, None].expand(n, hout, wout)
    x0_expanded = x0dst_norm[:, None, None].expand(n, hout, wout)
    y0_expanded = y0dst_norm[:, None, None].expand(n, hout, wout)
    grid_x = grid_w_expanded * dx_expanded + x0_expanded
    grid_y = grid_h_expanded * dy_expanded + y0_expanded
    grid = torch.stack((grid_x, grid_y), dim=3)
    # resample z from (N, C, H, W) into (N, C, hout, wout)
    zresampled = F.grid_sample(z, grid, mode=mode, padding_mode=padding_mode, align_corners=True)
    return zresampled
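
# Usage sketch (not part of the original file): _resample_data maps data given
# relative to the source boxes onto a fixed grid aligned with the destination
# boxes. For instance, GT segmentation stored relative to GT boxes can be
# brought into the frame of the estimated boxes at heatmap resolution S via
#     s_gt = _resample_data(
#         s_gt_raw.unsqueeze(1), bbox_xywh_gt, bbox_xywh_est, S, S,
#         mode="nearest", padding_mode="zeros",
#     ).squeeze(1)
# which mirrors how DensePoseLosses uses it below.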


def _extract_single_tensors_from_matches_one_image(
    proposals_targets, bbox_with_dp_offset, bbox_global_offset
):
    i_gt_all = []
    x_norm_all = []
    y_norm_all = []
    u_gt_all = []
    v_gt_all = []
    s_gt_all = []
    bbox_xywh_gt_all = []
    bbox_xywh_est_all = []
    # i_bbox_all == k should be true for all data that corresponds
    # to bbox_xywh_gt[k] and bbox_xywh_est[k]
    # index k here is global wrt images
    i_bbox_all = []
    # at offset k (k is global) contains index of bounding box data
    # within densepose output tensor
    i_with_dp = []

    boxes_xywh_est = proposals_targets.proposal_boxes.clone()
    boxes_xywh_gt = proposals_targets.gt_boxes.clone()
    n_i = len(boxes_xywh_est)
    assert n_i == len(boxes_xywh_gt)

    if n_i:
        # convert box coordinates from XYXY to XYWH format (x2 -> w, y2 -> h)
        boxes_xywh_est.tensor[:, 2] -= boxes_xywh_est.tensor[:, 0]
        boxes_xywh_est.tensor[:, 3] -= boxes_xywh_est.tensor[:, 1]
        boxes_xywh_gt.tensor[:, 2] -= boxes_xywh_gt.tensor[:, 0]
        boxes_xywh_gt.tensor[:, 3] -= boxes_xywh_gt.tensor[:, 1]
        if hasattr(proposals_targets, "gt_densepose"):
            densepose_gt = proposals_targets.gt_densepose
            for k, box_xywh_est, box_xywh_gt, dp_gt in zip(
                range(n_i), boxes_xywh_est.tensor, boxes_xywh_gt.tensor, densepose_gt
            ):
                if (dp_gt is not None) and (len(dp_gt.x) > 0):
                    i_gt_all.append(dp_gt.i)
                    x_norm_all.append(dp_gt.x)
                    y_norm_all.append(dp_gt.y)
                    u_gt_all.append(dp_gt.u)
                    v_gt_all.append(dp_gt.v)
                    s_gt_all.append(dp_gt.segm.unsqueeze(0))
                    bbox_xywh_gt_all.append(box_xywh_gt.view(-1, 4))
                    bbox_xywh_est_all.append(box_xywh_est.view(-1, 4))
                    i_bbox_k = torch.full_like(dp_gt.i, bbox_with_dp_offset + len(i_with_dp))
                    i_bbox_all.append(i_bbox_k)
                    i_with_dp.append(bbox_global_offset + k)
    return (
        i_gt_all,
        x_norm_all,
        y_norm_all,
        u_gt_all,
        v_gt_all,
        s_gt_all,
        bbox_xywh_gt_all,
        bbox_xywh_est_all,
        i_bbox_all,
        i_with_dp,
    )


def _extract_single_tensors_from_matches(proposals_with_targets):
    i_img = []
    i_gt_all = []
    x_norm_all = []
    y_norm_all = []
    u_gt_all = []
    v_gt_all = []
    s_gt_all = []
    bbox_xywh_gt_all = []
    bbox_xywh_est_all = []
    i_bbox_all = []
    i_with_dp_all = []
    n = 0
    for i, proposals_targets_per_image in enumerate(proposals_with_targets):
        n_i = proposals_targets_per_image.proposal_boxes.tensor.size(0)
        if not n_i:
            continue
        (
            i_gt_img,
            x_norm_img,
            y_norm_img,
            u_gt_img,
            v_gt_img,
            s_gt_img,
            bbox_xywh_gt_img,
            bbox_xywh_est_img,
            i_bbox_img,
            i_with_dp_img,
        ) = _extract_single_tensors_from_matches_one_image(  # noqa
            proposals_targets_per_image, len(i_with_dp_all), n
        )
        i_gt_all.extend(i_gt_img)
        x_norm_all.extend(x_norm_img)
        y_norm_all.extend(y_norm_img)
        u_gt_all.extend(u_gt_img)
        v_gt_all.extend(v_gt_img)
        s_gt_all.extend(s_gt_img)
        bbox_xywh_gt_all.extend(bbox_xywh_gt_img)
        bbox_xywh_est_all.extend(bbox_xywh_est_img)
        i_bbox_all.extend(i_bbox_img)
        i_with_dp_all.extend(i_with_dp_img)
        i_img.extend([i] * len(i_with_dp_img))
        n += n_i
    # concatenate all data into a single tensor
    if (n > 0) and (len(i_with_dp_all) > 0):
        i_gt = torch.cat(i_gt_all, 0).long()
        x_norm = torch.cat(x_norm_all, 0)
        y_norm = torch.cat(y_norm_all, 0)
        u_gt = torch.cat(u_gt_all, 0)
        v_gt = torch.cat(v_gt_all, 0)
        s_gt = torch.cat(s_gt_all, 0)
        bbox_xywh_gt = torch.cat(bbox_xywh_gt_all, 0)
        bbox_xywh_est = torch.cat(bbox_xywh_est_all, 0)
        i_bbox = torch.cat(i_bbox_all, 0).long()
    else:
        i_gt = None
        x_norm = None
        y_norm = None
        u_gt = None
        v_gt = None
        s_gt = None
        bbox_xywh_gt = None
        bbox_xywh_est = None
        i_bbox = None
    return (
        i_img,
        i_with_dp_all,
        bbox_xywh_est,
        bbox_xywh_gt,
        i_gt,
        x_norm,
        y_norm,
        u_gt,
        v_gt,
        s_gt,
        i_bbox,
    )


@dataclass
class DataForMaskLoss:
    """
    Contains mask GT and estimated data for proposals from multiple images.
    """

    # tensor of size (K, H, W) containing GT labels
    masks_gt: Optional[torch.Tensor] = None
    # tensor of size (K, C, H, W) containing estimated scores
    masks_est: Optional[torch.Tensor] = None


def _extract_data_for_mask_loss_from_matches(
    proposals_targets: Iterable[Instances], estimated_segm: torch.Tensor
) -> DataForMaskLoss:
    """
    Extract data for mask loss from instances that contain matched GT and
    estimated bounding boxes.
    Args:
        proposals_targets: Iterable[Instances]
            matched GT and estimated results, each item in the iterable
            corresponds to data in 1 image
        estimated_segm: torch.Tensor of size (K, C, S, S), where S is the
            size to which GT masks are resized
    Return:
        masks_est: tensor(K, C, H, W) of float - class scores
        masks_gt: tensor(K, H, W) of int64 - labels
    """
    data = DataForMaskLoss()
    masks_gt = []
    offset = 0
    assert estimated_segm.shape[2] == estimated_segm.shape[3], (
        f"Expected estimated segmentation to have a square shape, "
        f"but the actual shape is {estimated_segm.shape[2:]}"
    )
    mask_size = estimated_segm.shape[2]
    num_proposals = sum(inst.proposal_boxes.tensor.size(0) for inst in proposals_targets)
    num_estimated = estimated_segm.shape[0]
    assert (
        num_proposals == num_estimated
    ), "The number of proposals {} must be equal to the number of estimates {}".format(
        num_proposals, num_estimated
    )

    for proposals_targets_per_image in proposals_targets:
        n_i = proposals_targets_per_image.proposal_boxes.tensor.size(0)
        if not n_i:
            continue
        gt_masks_per_image = proposals_targets_per_image.gt_masks.crop_and_resize(
            proposals_targets_per_image.proposal_boxes.tensor, mask_size
        ).to(device=estimated_segm.device)
        masks_gt.append(gt_masks_per_image)
        offset += n_i
    if masks_gt:
        data.masks_est = estimated_segm
        data.masks_gt = torch.cat(masks_gt, dim=0)
    return data


class IIDIsotropicGaussianUVLoss(nn.Module):
    """
    Loss for the case of iid residuals with isotropic covariance:
    $Sigma_i = sigma_i^2 I$
    The loss (negative log likelihood) is then:
    $1/2 sum_{i=1}^n (log(2 pi) + 2 log sigma_i^2 + ||delta_i||^2 / sigma_i^2)$,
    where $delta_i = (u - u', v - v')$ is a 2D vector containing the UV
    coordinate difference between estimated and ground truth values.
    For details, see:
    N. Neverova, D. Novotny, A. Vedaldi "Correlated Uncertainty for Learning
    Dense Correspondences from Noisy Labels", p. 918--926, in Proc. NIPS 2019
    """

    def __init__(self, sigma_lower_bound: float):
        super(IIDIsotropicGaussianUVLoss, self).__init__()
        self.sigma_lower_bound = sigma_lower_bound
        self.log2pi = math.log(2 * math.pi)

    def forward(
        self,
        u: torch.Tensor,
        v: torch.Tensor,
        sigma_u: torch.Tensor,
        target_u: torch.Tensor,
        target_v: torch.Tensor,
    ):
        # compute $sigma_i^2$
        # use sigma_lower_bound to avoid a degenerate solution for the
        # variance (sigma -> 0)
        sigma2 = F.softplus(sigma_u) + self.sigma_lower_bound
        # compute $||delta_i||^2$
        delta_t_delta = (u - target_u) ** 2 + (v - target_v) ** 2
        # the total loss from the formula above:
        loss = 0.5 * (self.log2pi + 2 * torch.log(sigma2) + delta_t_delta / sigma2)
        return loss.sum()
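
# Worked check (not part of the original file): for a perfect prediction
# (delta_t_delta == 0) the per-point loss reduces to
#     0.5 * (log(2 * pi) + 2 * log(sigma2)),
# which is minimized by driving sigma2 toward its lower bound; conversely,
# for a fixed residual d the per-point loss is stationary at sigma2 = d / 2,
# so larger errors push the predicted variance up.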


class IndepAnisotropicGaussianUVLoss(nn.Module):
    """
    Loss for the case of independent residuals with anisotropic covariances:
    $Sigma_i = sigma_i^2 I + r_i r_i^T$
    The loss (negative log likelihood) is then:
    $1/2 sum_{i=1}^n (log(2 pi)
        + log sigma_i^2 (sigma_i^2 + ||r_i||^2)
        + ||delta_i||^2 / sigma_i^2
        - <delta_i, r_i>^2 / (sigma_i^2 * (sigma_i^2 + ||r_i||^2)))$,
    where $delta_i = (u - u', v - v')$ is a 2D vector containing the UV
    coordinate difference between estimated and ground truth values.
    For details, see:
    N. Neverova, D. Novotny, A. Vedaldi "Correlated Uncertainty for Learning
    Dense Correspondences from Noisy Labels", p. 918--926, in Proc. NIPS 2019
    """

    def __init__(self, sigma_lower_bound: float):
        super(IndepAnisotropicGaussianUVLoss, self).__init__()
        self.sigma_lower_bound = sigma_lower_bound
        self.log2pi = math.log(2 * math.pi)

    def forward(
        self,
        u: torch.Tensor,
        v: torch.Tensor,
        sigma_u: torch.Tensor,
        kappa_u_est: torch.Tensor,
        kappa_v_est: torch.Tensor,
        target_u: torch.Tensor,
        target_v: torch.Tensor,
    ):
        # compute $sigma_i^2$
        sigma2 = F.softplus(sigma_u) + self.sigma_lower_bound
        # compute $||r_i||^2$
        r_sqnorm2 = kappa_u_est ** 2 + kappa_v_est ** 2
        delta_u = u - target_u
        delta_v = v - target_v
        # compute $||delta_i||^2$
        delta_sqnorm = delta_u ** 2 + delta_v ** 2
        delta_u_r_u = delta_u * kappa_u_est
        delta_v_r_v = delta_v * kappa_v_est
        # compute the scalar product $<delta_i, r_i>$
        delta_r = delta_u_r_u + delta_v_r_v
        # compute the squared scalar product $<delta_i, r_i>^2$
        delta_r_sqnorm = delta_r ** 2
        denom2 = sigma2 * (sigma2 + r_sqnorm2)
        loss = 0.5 * (
            self.log2pi + torch.log(denom2) + delta_sqnorm / sigma2 - delta_r_sqnorm / denom2
        )
        return loss.sum()
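
# Consistency note (not part of the original file): with kappa_u_est and
# kappa_v_est equal to zero, r_sqnorm2 == 0 and delta_r_sqnorm == 0, so
#     loss = 0.5 * (log(2 * pi) + 2 * log(sigma2) + delta_sqnorm / sigma2),
# i.e. the anisotropic loss degenerates to IIDIsotropicGaussianUVLoss, as
# expected from Sigma_i = sigma_i^2 I + r_i r_i^T with r_i = 0.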


class DensePoseLosses(object):
    def __init__(self, cfg):
        # fmt: off
        self.heatmap_size = cfg.MODEL.ROI_DENSEPOSE_HEAD.HEATMAP_SIZE
        self.w_points = cfg.MODEL.ROI_DENSEPOSE_HEAD.POINT_REGRESSION_WEIGHTS
        self.w_part = cfg.MODEL.ROI_DENSEPOSE_HEAD.PART_WEIGHTS
        self.w_segm = cfg.MODEL.ROI_DENSEPOSE_HEAD.INDEX_WEIGHTS
        self.n_segm_chan = cfg.MODEL.ROI_DENSEPOSE_HEAD.NUM_COARSE_SEGM_CHANNELS
        # fmt: on
        self.segm_trained_by_masks = cfg.MODEL.ROI_DENSEPOSE_HEAD.COARSE_SEGM_TRAINED_BY_MASKS
        self.confidence_model_cfg = DensePoseConfidenceModelConfig.from_cfg(cfg)
        if self.confidence_model_cfg.uv_confidence.type == DensePoseUVConfidenceType.IID_ISO:
            self.uv_loss_with_confidences = IIDIsotropicGaussianUVLoss(
                self.confidence_model_cfg.uv_confidence.epsilon
            )
        elif self.confidence_model_cfg.uv_confidence.type == DensePoseUVConfidenceType.INDEP_ANISO:
            self.uv_loss_with_confidences = IndepAnisotropicGaussianUVLoss(
                self.confidence_model_cfg.uv_confidence.epsilon
            )

    def __call__(self, proposals_with_gt, densepose_outputs, densepose_confidences):
        if not self.segm_trained_by_masks:
            return self.produce_densepose_losses(
                proposals_with_gt, densepose_outputs, densepose_confidences
            )
        else:
            losses = {}
            losses_densepose = self.produce_densepose_losses(
                proposals_with_gt, densepose_outputs, densepose_confidences
            )
            losses.update(losses_densepose)
            losses_mask = self.produce_mask_losses(
                proposals_with_gt, densepose_outputs, densepose_confidences
            )
            losses.update(losses_mask)
            return losses

    def produce_fake_mask_losses(self, densepose_outputs):
        losses = {}
        segm_scores, _, _, _ = densepose_outputs
        losses["loss_densepose_S"] = segm_scores.sum() * 0
        return losses

    def produce_mask_losses(self, proposals_with_gt, densepose_outputs, densepose_confidences):
        if not len(proposals_with_gt):
            return self.produce_fake_mask_losses(densepose_outputs)
        losses = {}
        # densepose outputs are computed for all images and all bounding boxes;
        # i.e. if a batch has 4 images with (3, 1, 2, 1) proposals respectively,
        # the outputs will have size(0) == 3+1+2+1 == 7
        segm_scores, _, _, _ = densepose_outputs
        with torch.no_grad():
            mask_loss_data = _extract_data_for_mask_loss_from_matches(
                proposals_with_gt, segm_scores
            )
        if (mask_loss_data.masks_gt is None) or (mask_loss_data.masks_est is None):
            return self.produce_fake_mask_losses(densepose_outputs)
        losses["loss_densepose_S"] = (
            F.cross_entropy(mask_loss_data.masks_est, mask_loss_data.masks_gt.long()) * self.w_segm
        )
        return losses

    def produce_fake_densepose_losses(self, densepose_outputs, densepose_confidences):
        # we need to keep the same computation graph on all the GPUs to
        # perform reduction properly. Hence even if we have no data on one
        # of the GPUs, we still need to generate the computation graph.
        # Add fake (zero) losses in the form Tensor.sum() * 0
        s, index_uv, u, v = densepose_outputs
        conf_type = self.confidence_model_cfg.uv_confidence.type
        (
            sigma_1,
            sigma_2,
            kappa_u,
            kappa_v,
            fine_segm_confidence,
            coarse_segm_confidence,
        ) = densepose_confidences
        losses = {}
        losses["loss_densepose_I"] = index_uv.sum() * 0
        if not self.segm_trained_by_masks:
            losses["loss_densepose_S"] = s.sum() * 0
        if self.confidence_model_cfg.uv_confidence.enabled:
            losses["loss_densepose_UV"] = (u.sum() + v.sum()) * 0
            if conf_type == DensePoseUVConfidenceType.IID_ISO:
                losses["loss_densepose_UV"] += sigma_2.sum() * 0
            elif conf_type == DensePoseUVConfidenceType.INDEP_ANISO:
                losses["loss_densepose_UV"] += (sigma_2.sum() + kappa_u.sum() + kappa_v.sum()) * 0
        else:
            losses["loss_densepose_U"] = u.sum() * 0
            losses["loss_densepose_V"] = v.sum() * 0
        return losses

    def produce_densepose_losses(self, proposals_with_gt, densepose_outputs, densepose_confidences):
        losses = {}
        # densepose outputs are computed for all images and all bounding boxes;
        # i.e. if a batch has 4 images with (3, 1, 2, 1) proposals respectively,
        # the outputs will have size(0) == 3+1+2+1 == 7
        s, index_uv, u, v = densepose_outputs
        assert u.size(2) == v.size(2)
        assert u.size(3) == v.size(3)
        assert u.size(2) == index_uv.size(2)
        assert u.size(3) == index_uv.size(3)
        densepose_outputs_size = u.size()

        if not len(proposals_with_gt):
            return self.produce_fake_densepose_losses(densepose_outputs, densepose_confidences)
        (
            sigma_1,
            sigma_2,
            kappa_u,
            kappa_v,
            fine_segm_confidence,
            coarse_segm_confidence,
        ) = densepose_confidences
        conf_type = self.confidence_model_cfg.uv_confidence.type

        tensors_helper = SingleTensorsHelper(proposals_with_gt)
        n_batch = len(tensors_helper.i_with_dp)

        # NOTE: we need to keep the same computation graph on all the GPUs to
        # perform reduction properly. Hence even if we have no data on one
        # of the GPUs, we still need to generate the computation graph.
        # Add fake (zero) loss in the form Tensor.sum() * 0
        if not n_batch:
            return self.produce_fake_densepose_losses(densepose_outputs, densepose_confidences)

        interpolator = BilinearInterpolationHelper.from_matches(
            tensors_helper, densepose_outputs_size
        )

        j_valid_fg = interpolator.j_valid * (tensors_helper.index_gt_all > 0)

        u_gt = tensors_helper.u_gt_all[j_valid_fg]
        u_est_all = interpolator.extract_at_points(u[tensors_helper.i_with_dp])
        u_est = u_est_all[j_valid_fg]

        v_gt = tensors_helper.v_gt_all[j_valid_fg]
        v_est_all = interpolator.extract_at_points(v[tensors_helper.i_with_dp])
        v_est = v_est_all[j_valid_fg]

        index_uv_gt = tensors_helper.index_gt_all[interpolator.j_valid]
        index_uv_est_all = interpolator.extract_at_points(
            index_uv[tensors_helper.i_with_dp],
            slice_index_uv=slice(None),
            w_ylo_xlo=interpolator.w_ylo_xlo[:, None],
            w_ylo_xhi=interpolator.w_ylo_xhi[:, None],
            w_yhi_xlo=interpolator.w_yhi_xlo[:, None],
            w_yhi_xhi=interpolator.w_yhi_xhi[:, None],
        )
        index_uv_est = index_uv_est_all[interpolator.j_valid, :]

        if self.confidence_model_cfg.uv_confidence.enabled:
            sigma_2_est_all = interpolator.extract_at_points(sigma_2[tensors_helper.i_with_dp])
            sigma_2_est = sigma_2_est_all[j_valid_fg]
            if conf_type in [DensePoseUVConfidenceType.INDEP_ANISO]:
                kappa_u_est_all = interpolator.extract_at_points(kappa_u[tensors_helper.i_with_dp])
                kappa_u_est = kappa_u_est_all[j_valid_fg]
                kappa_v_est_all = interpolator.extract_at_points(kappa_v[tensors_helper.i_with_dp])
                kappa_v_est = kappa_v_est_all[j_valid_fg]

        # Resample everything to the estimated data size, no need to resample
        # S_est then:
        if not self.segm_trained_by_masks:
            s_est = s[tensors_helper.i_with_dp]
            with torch.no_grad():
                s_gt = _resample_data(
                    tensors_helper.s_gt.unsqueeze(1),
                    tensors_helper.bbox_xywh_gt,
                    tensors_helper.bbox_xywh_est,
                    self.heatmap_size,
                    self.heatmap_size,
                    mode="nearest",
                    padding_mode="zeros",
                ).squeeze(1)

        # add point-based losses:
        if self.confidence_model_cfg.uv_confidence.enabled:
            if conf_type == DensePoseUVConfidenceType.IID_ISO:
                uv_loss = (
                    self.uv_loss_with_confidences(u_est, v_est, sigma_2_est, u_gt, v_gt)
                    * self.w_points
                )
                losses["loss_densepose_UV"] = uv_loss
            elif conf_type == DensePoseUVConfidenceType.INDEP_ANISO:
                uv_loss = (
                    self.uv_loss_with_confidences(
                        u_est, v_est, sigma_2_est, kappa_u_est, kappa_v_est, u_gt, v_gt
                    )
                    * self.w_points
                )
                losses["loss_densepose_UV"] = uv_loss
            else:
                raise ValueError(f"Unknown confidence model type: {conf_type}")
        else:
            u_loss = F.smooth_l1_loss(u_est, u_gt, reduction="sum") * self.w_points
            losses["loss_densepose_U"] = u_loss
            v_loss = F.smooth_l1_loss(v_est, v_gt, reduction="sum") * self.w_points
            losses["loss_densepose_V"] = v_loss
        index_uv_loss = F.cross_entropy(index_uv_est, index_uv_gt.long()) * self.w_part
        losses["loss_densepose_I"] = index_uv_loss

        if not self.segm_trained_by_masks:
            if self.n_segm_chan == 2:
                s_gt = s_gt > 0
            s_loss = F.cross_entropy(s_est, s_gt.long()) * self.w_segm
            losses["loss_densepose_S"] = s_loss
        return losses
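
# Summary (not part of the original file): depending on the configuration,
# the dict returned above contains "loss_densepose_I" (fine segmentation),
# "loss_densepose_S" (coarse segmentation, computed here or in
# produce_mask_losses), and either "loss_densepose_U" / "loss_densepose_V"
# (smooth L1) or a single "loss_densepose_UV" when UV confidence modeling
# is enabled.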

@@ -0,0 +1,5 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved

from .chart import DensePoseChartPredictor
from .chart_confidence import DensePoseChartConfidencePredictorMixin
from .chart_with_confidence import DensePoseChartWithConfidencePredictor

@@ -0,0 +1,102 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved

import torch
from torch import nn

from detectron2.config import CfgNode
from detectron2.layers import ConvTranspose2d, interpolate

from ..utils import initialize_module_params


class DensePoseChartPredictor(nn.Module):
    """
    Predictor (last layers of a DensePose model) that takes DensePose head outputs as an input
    and produces 4 tensors which represent DensePose results for predefined body parts
    (patches / charts):
     - coarse segmentation [N, K, H, W]
     - fine segmentation [N, C, H, W]
     - U coordinates [N, C, H, W]
     - V coordinates [N, C, H, W]
    where
     - N is the number of instances
     - K is the number of coarse segmentation channels (
         2 = foreground / background,
         15 = one of 14 body parts / background)
     - C is the number of fine segmentation channels (
         24 fine body parts / background)
     - H and W are height and width of predictions
    """

    def __init__(self, cfg: CfgNode, input_channels: int):
        """
        Initialize predictor using configuration options

        Args:
            cfg (CfgNode): configuration options
            input_channels (int): input tensor size along the channel dimension
        """
        super().__init__()
        dim_in = input_channels
        n_segm_chan = cfg.MODEL.ROI_DENSEPOSE_HEAD.NUM_COARSE_SEGM_CHANNELS
        dim_out_patches = cfg.MODEL.ROI_DENSEPOSE_HEAD.NUM_PATCHES + 1
        kernel_size = cfg.MODEL.ROI_DENSEPOSE_HEAD.DECONV_KERNEL
        self.ann_index_lowres = ConvTranspose2d(
            dim_in, n_segm_chan, kernel_size, stride=2, padding=int(kernel_size / 2 - 1)
        )
        self.index_uv_lowres = ConvTranspose2d(
            dim_in, dim_out_patches, kernel_size, stride=2, padding=int(kernel_size / 2 - 1)
        )
        self.u_lowres = ConvTranspose2d(
            dim_in, dim_out_patches, kernel_size, stride=2, padding=int(kernel_size / 2 - 1)
        )
        self.v_lowres = ConvTranspose2d(
            dim_in, dim_out_patches, kernel_size, stride=2, padding=int(kernel_size / 2 - 1)
        )
        self.scale_factor = cfg.MODEL.ROI_DENSEPOSE_HEAD.UP_SCALE
        initialize_module_params(self)

    def interp2d(self, tensor_nchw: torch.Tensor):
        """
        Bilinear interpolation method to be used for upscaling

        Args:
            tensor_nchw (tensor): tensor of shape (N, C, H, W)
        Return:
            tensor of shape (N, C, Hout, Wout), where Hout and Wout are computed
                by applying the scale factor to H and W
        """
        return interpolate(
            tensor_nchw, scale_factor=self.scale_factor, mode="bilinear", align_corners=False
        )

    def forward(self, head_outputs: torch.Tensor):
        """
        Perform forward step on DensePose head outputs

        Args:
            head_outputs (tensor): DensePose head outputs, tensor of shape [N, D, H, W]
        Return:
            - a tuple of 4 tensors containing DensePose predictions for charts:
                * coarse segmentation estimate, a tensor of shape [N, K, Hout, Wout]
                * fine segmentation estimate, a tensor of shape [N, C, Hout, Wout]
                * U coordinates, a tensor of shape [N, C, Hout, Wout]
                * V coordinates, a tensor of shape [N, C, Hout, Wout]
            - a tuple of 4 tensors containing DensePose predictions for charts at reduced
              resolution:
                * coarse segmentation estimate, a tensor of shape [N, K, Hout / 2, Wout / 2]
                * fine segmentation estimate, a tensor of shape [N, C, Hout / 2, Wout / 2]
                * U coordinates, a tensor of shape [N, C, Hout / 2, Wout / 2]
                * V coordinates, a tensor of shape [N, C, Hout / 2, Wout / 2]
        """
        coarse_segm_lowres = self.ann_index_lowres(head_outputs)
        fine_segm_lowres = self.index_uv_lowres(head_outputs)
        u_lowres = self.u_lowres(head_outputs)
        v_lowres = self.v_lowres(head_outputs)

        coarse_segm = self.interp2d(coarse_segm_lowres)
        fine_segm = self.interp2d(fine_segm_lowres)
        u = self.interp2d(u_lowres)
        v = self.interp2d(v_lowres)
        siuv = (coarse_segm, fine_segm, u, v)
        siuv_lowres = (coarse_segm_lowres, fine_segm_lowres, u_lowres, v_lowres)
        return siuv, siuv_lowres
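
# Shape sketch (not part of the original file), assuming the common settings
# DECONV_KERNEL = 4 and UP_SCALE = 2: each ConvTranspose2d with stride 2 and
# padding 1 doubles the spatial size of the [N, D, H, W] head outputs to
# 2H x 2W, and interp2d then upscales by another factor of 2, so the full
# resolution SIUV tensors come out at Hout x Wout = 4H x 4W.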

@@ -0,0 +1,176 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved

import torch
from torch.nn import functional as F

from detectron2.config import CfgNode
from detectron2.layers import ConvTranspose2d

from ..confidence import DensePoseConfidenceModelConfig, DensePoseUVConfidenceType
from ..utils import initialize_module_params


class DensePoseChartConfidencePredictorMixin:
    """
    Predictor contains the last layers of a DensePose model that take DensePose head
    outputs as an input and produce model outputs. Confidence predictor mixin is used
    to generate confidences for segmentation and UV tensors estimated by some
    base predictor. Several assumptions need to hold for the base predictor:
    1) the `forward` method must return SIUV tuple as the first result (
        S = coarse segmentation, I = fine segmentation, U and V are intrinsic
        chart coordinates)
    2) `interp2d` method must be defined to perform bilinear interpolation;
        the same method is typically used for SIUV and confidences
    Confidence predictor mixin provides confidence estimates, as described in:
    N. Neverova et al., Correlated Uncertainty for Learning Dense Correspondences
        from Noisy Labels, NeurIPS 2019
    A. Sanakoyeu et al., Transferring Dense Pose to Proximal Animal Classes, CVPR 2020
    """

    def __init__(self, cfg: CfgNode, input_channels: int):
        """
        Initialize confidence predictor using configuration options.

        Args:
            cfg (CfgNode): configuration options
            input_channels (int): number of input channels
        """
        # we rely on the base predictor to call nn.Module.__init__
        super().__init__(cfg, input_channels)
        self.confidence_model_cfg = DensePoseConfidenceModelConfig.from_cfg(cfg)
        self._initialize_confidence_estimation_layers(cfg, input_channels)
        initialize_module_params(self)

    def _initialize_confidence_estimation_layers(self, cfg: CfgNode, dim_in: int):
        """
        Initialize confidence estimation layers based on configuration options

        Args:
            cfg (CfgNode): configuration options
            dim_in (int): number of input channels
        """
        dim_out_patches = cfg.MODEL.ROI_DENSEPOSE_HEAD.NUM_PATCHES + 1
        kernel_size = cfg.MODEL.ROI_DENSEPOSE_HEAD.DECONV_KERNEL
        if self.confidence_model_cfg.uv_confidence.enabled:
            if self.confidence_model_cfg.uv_confidence.type == DensePoseUVConfidenceType.IID_ISO:
                self.sigma_2_lowres = ConvTranspose2d(
                    dim_in, dim_out_patches, kernel_size, stride=2, padding=int(kernel_size / 2 - 1)
                )
            elif (
                self.confidence_model_cfg.uv_confidence.type
                == DensePoseUVConfidenceType.INDEP_ANISO
            ):
                self.sigma_2_lowres = ConvTranspose2d(
                    dim_in, dim_out_patches, kernel_size, stride=2, padding=int(kernel_size / 2 - 1)
                )
                self.kappa_u_lowres = ConvTranspose2d(
                    dim_in, dim_out_patches, kernel_size, stride=2, padding=int(kernel_size / 2 - 1)
                )
                self.kappa_v_lowres = ConvTranspose2d(
                    dim_in, dim_out_patches, kernel_size, stride=2, padding=int(kernel_size / 2 - 1)
                )
            else:
                raise ValueError(
                    f"Unknown confidence model type: "
                    f"{self.confidence_model_cfg.confidence_model_type}"
                )
        if self.confidence_model_cfg.segm_confidence.enabled:
            self.fine_segm_confidence_lowres = ConvTranspose2d(
                dim_in, 1, kernel_size, stride=2, padding=int(kernel_size / 2 - 1)
            )
            self.coarse_segm_confidence_lowres = ConvTranspose2d(
                dim_in, 1, kernel_size, stride=2, padding=int(kernel_size / 2 - 1)
            )

    def forward(self, head_outputs: torch.Tensor):
        """
        Perform forward operation on head outputs used as inputs for the predictor.
        Calls forward method from the base predictor and uses its outputs to compute
        confidences.

        Args:
            head_outputs (Tensor): head outputs used as predictor inputs
        Return:
            A tuple containing the following entries:
            - SIUV tuple with possibly modified segmentation tensors
            - various other outputs from the base predictor
            - 6 tensors with estimated confidence model parameters at full resolution
              (sigma_1, sigma_2, kappa_u, kappa_v, fine_segm_confidence, coarse_segm_confidence)
            - 6 tensors with estimated confidence model parameters at half resolution
              (sigma_1, sigma_2, kappa_u, kappa_v, fine_segm_confidence, coarse_segm_confidence)
        """
        # assuming the base class returns SIUV estimates in its first result
        base_predictor_outputs = super().forward(head_outputs)
        siuv = (
            base_predictor_outputs[0]
            if isinstance(base_predictor_outputs, tuple)
            else base_predictor_outputs
        )
        coarse_segm, fine_segm, u, v = siuv

        sigma_1, sigma_2, kappa_u, kappa_v = None, None, None, None
        sigma_1_lowres, sigma_2_lowres, kappa_u_lowres, kappa_v_lowres = None, None, None, None
        fine_segm_confidence_lowres, fine_segm_confidence = None, None
        coarse_segm_confidence_lowres, coarse_segm_confidence = None, None
        if self.confidence_model_cfg.uv_confidence.enabled:
            if self.confidence_model_cfg.uv_confidence.type == DensePoseUVConfidenceType.IID_ISO:
                sigma_2_lowres = self.sigma_2_lowres(head_outputs)
                # assuming the base class defines interp2d method for bilinear interpolation
                sigma_2 = self.interp2d(sigma_2_lowres)
            elif (
                self.confidence_model_cfg.uv_confidence.type
                == DensePoseUVConfidenceType.INDEP_ANISO
            ):
                sigma_2_lowres = self.sigma_2_lowres(head_outputs)
                kappa_u_lowres = self.kappa_u_lowres(head_outputs)
                kappa_v_lowres = self.kappa_v_lowres(head_outputs)
                # assuming the base class defines interp2d method for bilinear interpolation
                sigma_2 = self.interp2d(sigma_2_lowres)
                kappa_u = self.interp2d(kappa_u_lowres)
                kappa_v = self.interp2d(kappa_v_lowres)
            else:
                raise ValueError(
                    f"Unknown confidence model type: "
                    f"{self.confidence_model_cfg.confidence_model_type}"
                )
        if self.confidence_model_cfg.segm_confidence.enabled:
            fine_segm_confidence_lowres = self.fine_segm_confidence_lowres(head_outputs)
            # assuming the base class defines interp2d method for bilinear interpolation
            fine_segm_confidence = self.interp2d(fine_segm_confidence_lowres)
            fine_segm_confidence = (
                F.softplus(fine_segm_confidence) + self.confidence_model_cfg.segm_confidence.epsilon
            )
            fine_segm = fine_segm * torch.repeat_interleave(
                fine_segm_confidence, fine_segm.shape[1], dim=1
            )
            coarse_segm_confidence_lowres = self.coarse_segm_confidence_lowres(head_outputs)
            # assuming the base class defines interp2d method for bilinear interpolation
            coarse_segm_confidence = self.interp2d(coarse_segm_confidence_lowres)
            coarse_segm_confidence = (
                F.softplus(coarse_segm_confidence)
                + self.confidence_model_cfg.segm_confidence.epsilon
            )
            coarse_segm = coarse_segm * torch.repeat_interleave(
                coarse_segm_confidence, coarse_segm.shape[1], dim=1
            )
        results = []
        # append SIUV with possibly modified segmentation tensors
        results.append((coarse_segm, fine_segm, u, v))
        # append the rest of the base predictor outputs
        if isinstance(base_predictor_outputs, tuple):
            results.extend(base_predictor_outputs[1:])
        # append hi-res confidence estimates
        results.append(
            (sigma_1, sigma_2, kappa_u, kappa_v, fine_segm_confidence, coarse_segm_confidence)
        )
        # append lo-res confidence estimates
        results.append(
            (
                sigma_1_lowres,
                sigma_2_lowres,
                kappa_u_lowres,
                kappa_v_lowres,
                fine_segm_confidence_lowres,
                coarse_segm_confidence_lowres,
            )
        )
        return tuple(results)
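
# Layout note (not part of the original file): when mixed into
# DensePoseChartPredictor (whose forward returns (siuv, siuv_lowres)), the
# tuple built above unpacks as
#     siuv, siuv_lowres, confidences, confidences_lowres = predictor(head_outputs)
# which matches the 4-tuple consumed by DensePoseROIHeads._forward_densepose.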

@@ -0,0 +1,13 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved

from . import DensePoseChartConfidencePredictorMixin, DensePoseChartPredictor


class DensePoseChartWithConfidencePredictor(
    DensePoseChartConfidencePredictorMixin, DensePoseChartPredictor
):
    """
    Predictor that combines chart and chart confidence estimation
    """

    pass

@@ -0,0 +1,263 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved

import fvcore.nn.weight_init as weight_init
import torch
from torch import nn
from torch.nn import functional as F

from detectron2.config import CfgNode
from detectron2.layers import Conv2d

from .registry import ROI_DENSEPOSE_HEAD_REGISTRY


@ROI_DENSEPOSE_HEAD_REGISTRY.register()
class DensePoseDeepLabHead(nn.Module):
    """
    DensePose head using the DeepLabV3 model from
    "Rethinking Atrous Convolution for Semantic Image Segmentation"
    <https://arxiv.org/abs/1706.05587>.
    """

    def __init__(self, cfg: CfgNode, input_channels: int):
        super(DensePoseDeepLabHead, self).__init__()
        # fmt: off
        hidden_dim = cfg.MODEL.ROI_DENSEPOSE_HEAD.CONV_HEAD_DIM
        kernel_size = cfg.MODEL.ROI_DENSEPOSE_HEAD.CONV_HEAD_KERNEL
        norm = cfg.MODEL.ROI_DENSEPOSE_HEAD.DEEPLAB.NORM
        self.n_stacked_convs = cfg.MODEL.ROI_DENSEPOSE_HEAD.NUM_STACKED_CONVS
        self.use_nonlocal = cfg.MODEL.ROI_DENSEPOSE_HEAD.DEEPLAB.NONLOCAL_ON
        # fmt: on
        pad_size = kernel_size // 2
        n_channels = input_channels

        self.ASPP = ASPP(input_channels, [6, 12, 56], n_channels)  # 6, 12, 56
        self.add_module("ASPP", self.ASPP)

        if self.use_nonlocal:
            self.NLBlock = NONLocalBlock2D(input_channels, bn_layer=True)
            self.add_module("NLBlock", self.NLBlock)
        # weight_init.c2_msra_fill(self.ASPP)

        for i in range(self.n_stacked_convs):
            norm_module = nn.GroupNorm(32, hidden_dim) if norm == "GN" else None
            layer = Conv2d(
                n_channels,
                hidden_dim,
                kernel_size,
                stride=1,
                padding=pad_size,
                bias=not norm,
                norm=norm_module,
            )
            weight_init.c2_msra_fill(layer)
            n_channels = hidden_dim
            layer_name = self._get_layer_name(i)
            self.add_module(layer_name, layer)
        self.n_out_channels = hidden_dim
        # initialize_module_params(self)

    def forward(self, features):
        x0 = features
        x = self.ASPP(x0)
        if self.use_nonlocal:
            x = self.NLBlock(x)
        output = x
        for i in range(self.n_stacked_convs):
            layer_name = self._get_layer_name(i)
            x = getattr(self, layer_name)(x)
            x = F.relu(x)
            output = x
        return output

    def _get_layer_name(self, i: int):
        layer_name = "body_conv_fcn{}".format(i + 1)
        return layer_name


# Copied from
# https://github.com/pytorch/vision/blob/master/torchvision/models/segmentation/deeplabv3.py
# See https://arxiv.org/pdf/1706.05587.pdf for details
class ASPPConv(nn.Sequential):
    def __init__(self, in_channels, out_channels, dilation):
        modules = [
            nn.Conv2d(
                in_channels, out_channels, 3, padding=dilation, dilation=dilation, bias=False
            ),
            nn.GroupNorm(32, out_channels),
            nn.ReLU(),
        ]
        super(ASPPConv, self).__init__(*modules)


class ASPPPooling(nn.Sequential):
    def __init__(self, in_channels, out_channels):
        super(ASPPPooling, self).__init__(
            nn.AdaptiveAvgPool2d(1),
            nn.Conv2d(in_channels, out_channels, 1, bias=False),
            nn.GroupNorm(32, out_channels),
            nn.ReLU(),
        )

    def forward(self, x):
        size = x.shape[-2:]
        x = super(ASPPPooling, self).forward(x)
        return F.interpolate(x, size=size, mode="bilinear", align_corners=False)


class ASPP(nn.Module):
    def __init__(self, in_channels, atrous_rates, out_channels):
        super(ASPP, self).__init__()
        modules = []
        modules.append(
            nn.Sequential(
                nn.Conv2d(in_channels, out_channels, 1, bias=False),
                nn.GroupNorm(32, out_channels),
                nn.ReLU(),
            )
        )

        rate1, rate2, rate3 = tuple(atrous_rates)
        modules.append(ASPPConv(in_channels, out_channels, rate1))
        modules.append(ASPPConv(in_channels, out_channels, rate2))
        modules.append(ASPPConv(in_channels, out_channels, rate3))
        modules.append(ASPPPooling(in_channels, out_channels))

        self.convs = nn.ModuleList(modules)

        self.project = nn.Sequential(
            nn.Conv2d(5 * out_channels, out_channels, 1, bias=False),
            # nn.BatchNorm2d(out_channels),
            nn.ReLU()
            # nn.Dropout(0.5)
        )

    def forward(self, x):
        res = []
        for conv in self.convs:
            res.append(conv(x))
        res = torch.cat(res, dim=1)
        return self.project(res)
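
# Channel bookkeeping (not part of the original file): ASPP runs 5 parallel
# branches (one 1x1 conv, three dilated 3x3 convs with the given atrous rates,
# and one global-pooling branch), each producing out_channels maps, so the
# concatenation along dim=1 yields 5 * out_channels channels, which is exactly
# what the 1x1 projection above expects.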


# copied from
# https://github.com/AlexHex7/Non-local_pytorch/blob/master/lib/non_local_embedded_gaussian.py
# See https://arxiv.org/abs/1711.07971 for details
class _NonLocalBlockND(nn.Module):
    def __init__(
        self, in_channels, inter_channels=None, dimension=3, sub_sample=True, bn_layer=True
    ):
        super(_NonLocalBlockND, self).__init__()

        assert dimension in [1, 2, 3]

        self.dimension = dimension
        self.sub_sample = sub_sample

        self.in_channels = in_channels
        self.inter_channels = inter_channels

        if self.inter_channels is None:
            self.inter_channels = in_channels // 2
            if self.inter_channels == 0:
                self.inter_channels = 1

        if dimension == 3:
            conv_nd = nn.Conv3d
            max_pool_layer = nn.MaxPool3d(kernel_size=(1, 2, 2))
            bn = nn.GroupNorm  # replaces nn.BatchNorm3d from the original implementation
        elif dimension == 2:
            conv_nd = nn.Conv2d
            max_pool_layer = nn.MaxPool2d(kernel_size=(2, 2))
            bn = nn.GroupNorm  # replaces nn.BatchNorm2d from the original implementation
        else:
            conv_nd = nn.Conv1d
            max_pool_layer = nn.MaxPool1d(kernel_size=2)
            bn = nn.GroupNorm  # replaces nn.BatchNorm1d from the original implementation

        self.g = conv_nd(
            in_channels=self.in_channels,
            out_channels=self.inter_channels,
            kernel_size=1,
            stride=1,
            padding=0,
        )

        if bn_layer:
            self.W = nn.Sequential(
                conv_nd(
                    in_channels=self.inter_channels,
                    out_channels=self.in_channels,
                    kernel_size=1,
                    stride=1,
                    padding=0,
                ),
                bn(32, self.in_channels),
            )
            nn.init.constant_(self.W[1].weight, 0)
            nn.init.constant_(self.W[1].bias, 0)
        else:
            self.W = conv_nd(
                in_channels=self.inter_channels,
                out_channels=self.in_channels,
                kernel_size=1,
                stride=1,
                padding=0,
            )
            nn.init.constant_(self.W.weight, 0)
            nn.init.constant_(self.W.bias, 0)

        self.theta = conv_nd(
            in_channels=self.in_channels,
            out_channels=self.inter_channels,
            kernel_size=1,
            stride=1,
            padding=0,
        )
        self.phi = conv_nd(
            in_channels=self.in_channels,
            out_channels=self.inter_channels,
            kernel_size=1,
            stride=1,
            padding=0,
        )

        if sub_sample:
            self.g = nn.Sequential(self.g, max_pool_layer)
            self.phi = nn.Sequential(self.phi, max_pool_layer)

    def forward(self, x):
        """
        :param x: (b, c, t, h, w)
        :return: residual output z = W(y) + x with the same shape as x
        """

        batch_size = x.size(0)

        g_x = self.g(x).view(batch_size, self.inter_channels, -1)
        g_x = g_x.permute(0, 2, 1)

        theta_x = self.theta(x).view(batch_size, self.inter_channels, -1)
        theta_x = theta_x.permute(0, 2, 1)
        phi_x = self.phi(x).view(batch_size, self.inter_channels, -1)
        f = torch.matmul(theta_x, phi_x)
        f_div_C = F.softmax(f, dim=-1)

        y = torch.matmul(f_div_C, g_x)
        y = y.permute(0, 2, 1).contiguous()
        y = y.view(batch_size, self.inter_channels, *x.size()[2:])
        W_y = self.W(y)
        z = W_y + x

        return z
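
    # Shape walk-through (not part of the original file): with M = h * w query
    # positions (t * h * w in 3D) and M' the possibly sub-sampled key/value
    # positions, theta_x is (b, M, C'), phi_x is (b, C', M'), so the affinity
    # f = theta_x @ phi_x has shape (b, M, M'); the softmax over dim=-1 turns
    # each row into attention weights, and f_div_C @ g_x aggregates the values
    # back to (b, M, C') before the 1x1 projection W and the residual sum.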


class NONLocalBlock2D(_NonLocalBlockND):
    def __init__(self, in_channels, inter_channels=None, sub_sample=True, bn_layer=True):
        super(NONLocalBlock2D, self).__init__(
            in_channels,
            inter_channels=inter_channels,
            dimension=2,
            sub_sample=sub_sample,
            bn_layer=bn_layer,
        )

@@ -0,0 +1,5 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved

from detectron2.utils.registry import Registry

ROI_DENSEPOSE_HEAD_REGISTRY = Registry("ROI_DENSEPOSE_HEAD")

@@ -0,0 +1,224 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved

import numpy as np
from typing import Dict, List, Optional
import fvcore.nn.weight_init as weight_init
import torch
import torch.nn as nn
from torch.nn import functional as F

from detectron2.layers import Conv2d, ShapeSpec, get_norm
from detectron2.modeling import ROI_HEADS_REGISTRY, StandardROIHeads
from detectron2.modeling.poolers import ROIPooler
from detectron2.modeling.roi_heads import select_foreground_proposals
from detectron2.structures import ImageList, Instances

from .. import (
    build_densepose_data_filter,
    build_densepose_head,
    build_densepose_losses,
    build_densepose_predictor,
    densepose_inference,
)


class Decoder(nn.Module):
    """
    A semantic segmentation head described in detail in the Panoptic Feature Pyramid Networks
    paper (https://arxiv.org/abs/1901.02446). It takes FPN features as input and merges
    information from all levels of the FPN into a single output.
    """

    def __init__(self, cfg, input_shape: Dict[str, ShapeSpec], in_features):
        super(Decoder, self).__init__()

        # fmt: off
        self.in_features = in_features
        feature_strides = {k: v.stride for k, v in input_shape.items()}
        feature_channels = {k: v.channels for k, v in input_shape.items()}
        num_classes = cfg.MODEL.ROI_DENSEPOSE_HEAD.DECODER_NUM_CLASSES
        conv_dims = cfg.MODEL.ROI_DENSEPOSE_HEAD.DECODER_CONV_DIMS
        self.common_stride = cfg.MODEL.ROI_DENSEPOSE_HEAD.DECODER_COMMON_STRIDE
        norm = cfg.MODEL.ROI_DENSEPOSE_HEAD.DECODER_NORM
        # fmt: on

        self.scale_heads = []
        for in_feature in self.in_features:
            head_ops = []
            head_length = max(
                1, int(np.log2(feature_strides[in_feature]) - np.log2(self.common_stride))
            )
            for k in range(head_length):
                conv = Conv2d(
                    feature_channels[in_feature] if k == 0 else conv_dims,
                    conv_dims,
                    kernel_size=3,
                    stride=1,
                    padding=1,
                    bias=not norm,
                    norm=get_norm(norm, conv_dims),
                    activation=F.relu,
                )
                weight_init.c2_msra_fill(conv)
                head_ops.append(conv)
                if feature_strides[in_feature] != self.common_stride:
                    head_ops.append(
                        nn.Upsample(scale_factor=2, mode="bilinear", align_corners=False)
                    )
            self.scale_heads.append(nn.Sequential(*head_ops))
            self.add_module(in_feature, self.scale_heads[-1])
        self.predictor = Conv2d(conv_dims, num_classes, kernel_size=1, stride=1, padding=0)
        weight_init.c2_msra_fill(self.predictor)

    def forward(self, features: List[torch.Tensor]):
        for i, _ in enumerate(self.in_features):
            if i == 0:
                x = self.scale_heads[i](features[i])
            else:
                x = x + self.scale_heads[i](features[i])
        x = self.predictor(x)
        return x


@ROI_HEADS_REGISTRY.register()
class DensePoseROIHeads(StandardROIHeads):
    """
    A Standard ROIHeads which contains an addition of DensePose head.
    """

    def __init__(self, cfg, input_shape):
        super().__init__(cfg, input_shape)
        self._init_densepose_head(cfg, input_shape)

    def _init_densepose_head(self, cfg, input_shape):
        # fmt: off
        self.densepose_on = cfg.MODEL.DENSEPOSE_ON
        if not self.densepose_on:
            return
        self.densepose_data_filter = build_densepose_data_filter(cfg)
        dp_pooler_resolution = cfg.MODEL.ROI_DENSEPOSE_HEAD.POOLER_RESOLUTION
        dp_pooler_sampling_ratio = cfg.MODEL.ROI_DENSEPOSE_HEAD.POOLER_SAMPLING_RATIO
        dp_pooler_type = cfg.MODEL.ROI_DENSEPOSE_HEAD.POOLER_TYPE
        self.use_decoder = cfg.MODEL.ROI_DENSEPOSE_HEAD.DECODER_ON
        # fmt: on
        if self.use_decoder:
            dp_pooler_scales = (1.0 / input_shape[self.in_features[0]].stride,)
        else:
            dp_pooler_scales = tuple(1.0 / input_shape[k].stride for k in self.in_features)
        in_channels = [input_shape[f].channels for f in self.in_features][0]

        if self.use_decoder:
            self.decoder = Decoder(cfg, input_shape, self.in_features)

        self.densepose_pooler = ROIPooler(
            output_size=dp_pooler_resolution,
            scales=dp_pooler_scales,
            sampling_ratio=dp_pooler_sampling_ratio,
            pooler_type=dp_pooler_type,
        )
        self.densepose_head = build_densepose_head(cfg, in_channels)
        self.densepose_predictor = build_densepose_predictor(
            cfg, self.densepose_head.n_out_channels
        )
        self.densepose_losses = build_densepose_losses(cfg)

    def _forward_densepose(self, features: Dict[str, torch.Tensor], instances: List[Instances]):
        """
        Forward logic of the densepose prediction branch.

        Args:
            features (dict[str, Tensor]): input data as a mapping from feature
                map name to tensor. Axis 0 represents the number of images `N` in
                the input data; axes 1-3 are channels, height, and width, which may
                vary between feature maps (e.g., if a feature pyramid is used).
            instances (list[Instances]): length `N` list of `Instances`. The i-th
                `Instances` contains instances for the i-th input image.
                In training, they can be the proposals.
                In inference, they can be the predicted boxes.

        Returns:
            In training, a dict of losses.
            In inference, update `instances` with new fields "densepose" and return it.
        """
        if not self.densepose_on:
            return {} if self.training else instances

        features = [features[f] for f in self.in_features]
        if self.training:
            proposals, _ = select_foreground_proposals(instances, self.num_classes)
            features, proposals = self.densepose_data_filter(features, proposals)
            if len(proposals) > 0:
                proposal_boxes = [x.proposal_boxes for x in proposals]

                if self.use_decoder:
                    features = [self.decoder(features)]

                features_dp = self.densepose_pooler(features, proposal_boxes)
                densepose_head_outputs = self.densepose_head(features_dp)
                densepose_outputs, _, confidences, _ = self.densepose_predictor(
                    densepose_head_outputs
                )
                densepose_loss_dict = self.densepose_losses(
                    proposals, densepose_outputs, confidences
                )
                return densepose_loss_dict
        else:
            pred_boxes = [x.pred_boxes for x in instances]

            if self.use_decoder:
                features = [self.decoder(features)]

            features_dp = self.densepose_pooler(features, pred_boxes)
            if len(features_dp) > 0:
                densepose_head_outputs = self.densepose_head(features_dp)
                densepose_outputs, _, confidences, _ = self.densepose_predictor(
                    densepose_head_outputs
                )
            else:
                # if no detections occurred, set densepose outputs to empty tensors
                empty_tensor = torch.zeros(size=(0, 0, 0, 0), device=features_dp.device)
                densepose_outputs = tuple([empty_tensor] * 4)
                confidences = tuple([empty_tensor] * 6)

            densepose_inference(densepose_outputs, confidences, instances)
            return instances

    def forward(
        self,
        images: ImageList,
        features: Dict[str, torch.Tensor],
        proposals: List[Instances],
        targets: Optional[List[Instances]] = None,
    ):
        instances, losses = super().forward(images, features, proposals, targets)
        del targets, images

        if self.training:
            losses.update(self._forward_densepose(features, instances))
        return instances, losses

    def forward_with_given_boxes(
        self, features: Dict[str, torch.Tensor], instances: List[Instances]
    ):
        """
        Use the given boxes in `instances` to produce other (non-box) per-ROI outputs.

        This is useful for downstream tasks where a box is known, but one needs to
        obtain other attributes (outputs of other heads).
        Test-time augmentation also uses this.

        Args:
            features: same as in `forward()`
            instances (list[Instances]): instances to predict other outputs. Expect the keys
                "pred_boxes" and "pred_classes" to exist.

        Returns:
            instances (list[Instances]):
                the same `Instances` objects, with extra
                fields such as `pred_masks` or `pred_keypoints`.
        """

        instances = super().forward_with_given_boxes(features, instances)
        instances = self._forward_densepose(features, instances)
        return instances
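
# Configuration sketch (not part of the original file), assuming the standard
# DensePose config keys used above:
#     cfg.MODEL.ROI_HEADS.NAME = "DensePoseROIHeads"
#     cfg.MODEL.DENSEPOSE_ON = True
#     cfg.MODEL.ROI_DENSEPOSE_HEAD.DECODER_ON = True
# With DECODER_ON enabled, a single pooler scale is used (that of the first
# in_features level), since the Decoder first merges all FPN levels into one
# feature map.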

@@ -0,0 +1,64 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved

import torch
from torch import nn
from torch.nn import functional as F

from detectron2.config import CfgNode
from detectron2.layers import Conv2d

from ..utils import initialize_module_params
from .registry import ROI_DENSEPOSE_HEAD_REGISTRY


@ROI_DENSEPOSE_HEAD_REGISTRY.register()
class DensePoseV1ConvXHead(nn.Module):
    """
    Fully convolutional DensePose head.
    """

    def __init__(self, cfg: CfgNode, input_channels: int):
        """
        Initialize DensePose fully convolutional head

        Args:
            cfg (CfgNode): configuration options
            input_channels (int): number of input channels
        """
        super(DensePoseV1ConvXHead, self).__init__()
        # fmt: off
        hidden_dim = cfg.MODEL.ROI_DENSEPOSE_HEAD.CONV_HEAD_DIM
        kernel_size = cfg.MODEL.ROI_DENSEPOSE_HEAD.CONV_HEAD_KERNEL
        self.n_stacked_convs = cfg.MODEL.ROI_DENSEPOSE_HEAD.NUM_STACKED_CONVS
        # fmt: on
        pad_size = kernel_size // 2
        n_channels = input_channels
        for i in range(self.n_stacked_convs):
            layer = Conv2d(n_channels, hidden_dim, kernel_size, stride=1, padding=pad_size)
            layer_name = self._get_layer_name(i)
            self.add_module(layer_name, layer)
            n_channels = hidden_dim
        self.n_out_channels = n_channels
        initialize_module_params(self)

    def forward(self, features: torch.Tensor):
        """
        Apply DensePose fully convolutional head to the input features

        Args:
            features (tensor): input features
        Result:
            A tensor of DensePose head outputs
        """
        x = features
        output = x
        for i in range(self.n_stacked_convs):
            layer_name = self._get_layer_name(i)
            x = getattr(self, layer_name)(x)
            x = F.relu(x)
            output = x
        return output

    def _get_layer_name(self, i: int):
        layer_name = "body_conv_fcn{}".format(i + 1)
        return layer_name
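
# Registry usage sketch (not part of the original file): the decorator above
# makes this head selectable by name, e.g.
#     cfg.MODEL.ROI_DENSEPOSE_HEAD.NAME = "DensePoseV1ConvXHead"
# (assuming the standard NAME key), after which build_densepose_head(cfg,
# input_channels) instantiates it through ROI_DENSEPOSE_HEAD_REGISTRY.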

@@ -0,0 +1,250 @@
#!/usr/bin/env python3
|
||||
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
from timeit import default_timer as timer
|
||||
from typing import Any, ClassVar, Dict, List
|
||||
import torch
|
||||
from fvcore.common.file_io import PathManager
|
||||
|
||||
from detectron2.data.catalog import DatasetCatalog
|
||||
from detectron2.utils.logger import setup_logger
|
||||
|
||||
from densepose.data.structures import DensePoseDataRelative
|
||||
from densepose.utils.dbhelper import EntrySelector
|
||||
from densepose.utils.logger import verbosity_to_level
|
||||
from densepose.vis.base import CompoundVisualizer
|
||||
from densepose.vis.bounding_box import BoundingBoxVisualizer
|
||||
from densepose.vis.densepose import (
|
||||
DensePoseDataCoarseSegmentationVisualizer,
|
||||
DensePoseDataPointsIVisualizer,
|
||||
DensePoseDataPointsUVisualizer,
|
||||
DensePoseDataPointsVisualizer,
|
||||
DensePoseDataPointsVVisualizer,
|
||||
)
|
||||
|
||||
DOC = """Query DB - a tool to print / visualize data from a database
|
||||
"""
|
||||
|
||||
LOGGER_NAME = "query_db"
|
||||
|
||||
logger = logging.getLogger(LOGGER_NAME)
|
||||
|
||||
_ACTION_REGISTRY: Dict[str, "Action"] = {}
|
||||
|
||||
|
||||
class Action(object):
|
||||
@classmethod
|
||||
def add_arguments(cls: type, parser: argparse.ArgumentParser):
|
||||
parser.add_argument(
|
||||
"-v",
|
||||
"--verbosity",
|
||||
action="count",
|
||||
help="Verbose mode. Multiple -v options increase the verbosity.",
|
||||
)
|
||||
|
||||
|
||||
def register_action(cls: type):
|
||||
"""
|
||||
Decorator for action classes to automate action registration
|
||||
"""
|
||||
global _ACTION_REGISTRY
|
||||
_ACTION_REGISTRY[cls.COMMAND] = cls
|
||||
return cls
|
||||
|
||||
|
||||
class EntrywiseAction(Action):
|
||||
@classmethod
|
||||
def add_arguments(cls: type, parser: argparse.ArgumentParser):
|
||||
super(EntrywiseAction, cls).add_arguments(parser)
|
||||
parser.add_argument(
|
||||
"dataset", metavar="<dataset>", help="Dataset name (e.g. densepose_coco_2014_train)"
|
||||
)
|
||||
parser.add_argument(
|
||||
"selector",
|
||||
metavar="<selector>",
|
||||
help="Dataset entry selector in the form field1[:type]=value1[,"
|
||||
"field2[:type]=value_min-value_max...] which selects all "
|
||||
"entries from the dataset that satisfy the constraints",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--max-entries", metavar="N", help="Maximum number of entries to process", type=int
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def execute(cls: type, args: argparse.Namespace):
|
||||
dataset = setup_dataset(args.dataset)
|
||||
entry_selector = EntrySelector.from_string(args.selector)
|
||||
context = cls.create_context(args)
|
||||
if args.max_entries is not None:
|
||||
for _, entry in zip(range(args.max_entries), dataset):
|
||||
if entry_selector(entry):
|
||||
cls.execute_on_entry(entry, context)
|
||||
else:
|
||||
for entry in dataset:
|
||||
if entry_selector(entry):
|
||||
cls.execute_on_entry(entry, context)
|
||||
|
||||
@classmethod
|
||||
def create_context(cls: type, args: argparse.Namespace) -> Dict[str, Any]:
|
||||
context = {}
|
||||
return context
|
||||
|
||||
|
||||
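For reference, here is what selector strings matching the grammar above might look like; the field names and types are hypothetical and depend on the dataset's annotation schema:

```
image_id:int=335328                  # exact-match constraint on one field
image_id:int=335328,iscrowd:int=0    # comma-separated constraints, all must hold
num_keypoints:int=1-20               # range constraint (value_min-value_max)
```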
@register_action
class PrintAction(EntrywiseAction):
    """
    Print action that outputs selected entries to stdout
    """

    COMMAND: ClassVar[str] = "print"

    @classmethod
    def add_parser(cls: type, subparsers: argparse._SubParsersAction):
        parser = subparsers.add_parser(cls.COMMAND, help="Output selected entries to stdout. ")
        cls.add_arguments(parser)
        parser.set_defaults(func=cls.execute)

    @classmethod
    def add_arguments(cls: type, parser: argparse.ArgumentParser):
        super(PrintAction, cls).add_arguments(parser)

    @classmethod
    def execute_on_entry(cls: type, entry: Dict[str, Any], context: Dict[str, Any]):
        import pprint

        printer = pprint.PrettyPrinter(indent=2, width=200, compact=True)
        printer.pprint(entry)

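A possible `print` invocation built from the arguments registered above (the dataset name comes from the parser's own help text; the selector value is illustrative):

```
python query_db.py print densepose_coco_2014_train image_id:int=335328 --max-entries 1 -v
```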
@register_action
class ShowAction(EntrywiseAction):
    """
    Show action that visualizes selected entries on an image
    """

    COMMAND: ClassVar[str] = "show"
    VISUALIZERS: ClassVar[Dict[str, object]] = {
        "dp_segm": DensePoseDataCoarseSegmentationVisualizer(),
        "dp_i": DensePoseDataPointsIVisualizer(),
        "dp_u": DensePoseDataPointsUVisualizer(),
        "dp_v": DensePoseDataPointsVVisualizer(),
        "dp_pts": DensePoseDataPointsVisualizer(),
        "bbox": BoundingBoxVisualizer(),
    }

    @classmethod
    def add_parser(cls: type, subparsers: argparse._SubParsersAction):
        parser = subparsers.add_parser(cls.COMMAND, help="Visualize selected entries")
        cls.add_arguments(parser)
        parser.set_defaults(func=cls.execute)

    @classmethod
    def add_arguments(cls: type, parser: argparse.ArgumentParser):
        super(ShowAction, cls).add_arguments(parser)
        parser.add_argument(
            "visualizations",
            metavar="<visualizations>",
            help="Comma separated list of visualizations, possible values: "
            "[{}]".format(",".join(sorted(cls.VISUALIZERS.keys()))),
        )
        parser.add_argument(
            "--output",
            metavar="<image_file>",
            default="output.png",
            help="File name to save output to",
        )

    @classmethod
    def execute_on_entry(cls: type, entry: Dict[str, Any], context: Dict[str, Any]):
        import cv2
        import numpy as np

        image_fpath = PathManager.get_local_path(entry["file_name"])
        image = cv2.imread(image_fpath, cv2.IMREAD_GRAYSCALE)
        image = np.tile(image[:, :, np.newaxis], [1, 1, 3])
        datas = cls._extract_data_for_visualizers_from_entry(context["vis_specs"], entry)
        visualizer = context["visualizer"]
        image_vis = visualizer.visualize(image, datas)
        entry_idx = context["entry_idx"] + 1
        out_fname = cls._get_out_fname(entry_idx, context["out_fname"])
        cv2.imwrite(out_fname, image_vis)
        logger.info(f"Output saved to {out_fname}")
        context["entry_idx"] += 1

    @classmethod
    def _get_out_fname(cls: type, entry_idx: int, fname_base: str):
        base, ext = os.path.splitext(fname_base)
        return base + ".{0:04d}".format(entry_idx) + ext

    @classmethod
    def create_context(cls: type, args: argparse.Namespace) -> Dict[str, Any]:
        vis_specs = args.visualizations.split(",")
        visualizers = []
        for vis_spec in vis_specs:
            vis = cls.VISUALIZERS[vis_spec]
            visualizers.append(vis)
        context = {
            "vis_specs": vis_specs,
            "visualizer": CompoundVisualizer(visualizers),
            "out_fname": args.output,
            "entry_idx": 0,
        }
        return context

    @classmethod
    def _extract_data_for_visualizers_from_entry(
        cls: type, vis_specs: List[str], entry: Dict[str, Any]
    ):
        dp_list = []
        bbox_list = []
        for annotation in entry["annotations"]:
            is_valid, _ = DensePoseDataRelative.validate_annotation(annotation)
            if not is_valid:
                continue
            bbox = torch.as_tensor(annotation["bbox"])
            bbox_list.append(bbox)
            dp_data = DensePoseDataRelative(annotation)
            dp_list.append(dp_data)
        datas = []
        for vis_spec in vis_specs:
            datas.append(bbox_list if "bbox" == vis_spec else (bbox_list, dp_list))
        return datas

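Note that `_get_out_fname` splices a 4-digit entry index in front of the extension, so with `--output vis.png` successive matching entries are written to `vis.0001.png`, `vis.0002.png`, and so on. An illustrative `show` invocation (the selector value is hypothetical):

```
python query_db.py show densepose_coco_2014_train image_id:int=335328 dp_segm,bbox --output vis.png
```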
def setup_dataset(dataset_name):
    logger.info("Loading dataset {}".format(dataset_name))
    start = timer()
    dataset = DatasetCatalog.get(dataset_name)
    stop = timer()
    logger.info("Loaded dataset {} in {:.3f}s".format(dataset_name, stop - start))
    return dataset


def create_argument_parser() -> argparse.ArgumentParser:
    parser = argparse.ArgumentParser(
        description=DOC,
        formatter_class=lambda prog: argparse.HelpFormatter(prog, max_help_position=120),
    )
    parser.set_defaults(func=lambda _: parser.print_help(sys.stdout))
    subparsers = parser.add_subparsers(title="Actions")
    for _, action in _ACTION_REGISTRY.items():
        action.add_parser(subparsers)
    return parser


def main():
    parser = create_argument_parser()
    args = parser.parse_args()
    verbosity = args.verbosity if hasattr(args, "verbosity") else None
    global logger
    logger = setup_logger(name=LOGGER_NAME)
    logger.setLevel(verbosity_to_level(verbosity))
    args.func(args)


if __name__ == "__main__":
    main()
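Because the top-level parser registers `print_help` as its default function, invoking the tool with no action prints the list of registered actions; per-action options are available through the standard argparse `-h` flag:

```
python query_db.py            # top-level help listing the registered actions
python query_db.py show -h    # options accepted by the show action
```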
@@ -0,0 +1,74 @@
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved

"""
DensePose Training Script.

This script is similar to the training script in detectron2/tools.

It is an example of how a user might use detectron2 for a new project.
"""

from fvcore.common.file_io import PathManager

import detectron2.utils.comm as comm
from detectron2.config import get_cfg
from detectron2.engine import default_argument_parser, default_setup, hooks, launch
from detectron2.evaluation import verify_results
from detectron2.utils.logger import setup_logger

from densepose import add_densepose_config
from densepose.engine import Trainer
from densepose.modeling.densepose_checkpoint import DensePoseCheckpointer


def setup(args):
    cfg = get_cfg()
    add_densepose_config(cfg)
    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()
    default_setup(cfg, args)
    # Setup logger for "densepose" module
    setup_logger(output=cfg.OUTPUT_DIR, distributed_rank=comm.get_rank(), name="densepose")
    return cfg

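`setup` merges the config file first and the free-form `opts` list second, so individual values can be overridden from the command line. An illustrative invocation (the config path is an assumption, not taken from this diff):

```
python train_net.py --config-file configs/densepose_rcnn_R_50_FPN_s1x.yaml SOLVER.IMS_PER_BATCH 8 SOLVER.BASE_LR 0.01
```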
def main(args):
    cfg = setup(args)
    # disable strict kwargs checking: allow one to specify path handle
    # hints through kwargs, like timeout in DP evaluation
    PathManager.set_strict_kwargs_checking(False)

    if args.eval_only:
        model = Trainer.build_model(cfg)
        DensePoseCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load(
            cfg.MODEL.WEIGHTS, resume=args.resume
        )
        res = Trainer.test(cfg, model)
        if cfg.TEST.AUG.ENABLED:
            res.update(Trainer.test_with_TTA(cfg, model))
        if comm.is_main_process():
            verify_results(cfg, res)
        return res

    trainer = Trainer(cfg)
    trainer.resume_or_load(resume=args.resume)
    if cfg.TEST.AUG.ENABLED:
        trainer.register_hooks(
            [hooks.EvalHook(0, lambda: trainer.test_with_TTA(cfg, trainer.model))]
        )
    return trainer.train()

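In the `--eval-only` branch the checkpoint is read from `cfg.MODEL.WEIGHTS`, so a trained model can be pointed to via a config override (paths illustrative):

```
python train_net.py --config-file configs/densepose_rcnn_R_50_FPN_s1x.yaml --eval-only MODEL.WEIGHTS /path/to/model_final.pth
```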
if __name__ == "__main__":
    args = default_argument_parser().parse_args()
    print("Command Line Args:", args)
    launch(
        main,
        args.num_gpus,
        num_machines=args.num_machines,
        machine_rank=args.machine_rank,
        dist_url=args.dist_url,
        args=(args,),
    )
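`launch` takes care of spawning one process per GPU and wiring up distributed training from the standard detectron2 arguments, e.g. (config path illustrative):

```
python train_net.py --config-file configs/densepose_rcnn_R_50_FPN_s1x.yaml --num-gpus 8
```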