Add files via upload

main
RE-OWOD 2022-01-04 13:49:38 +08:00 committed by GitHub
parent 3c9dcce2c7
commit 28e2825941
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
100 changed files with 9052 additions and 0 deletions

View File

@ -1 +1,54 @@
# DensePose in Detectron2
**Dense Human Pose Estimation In The Wild**
_Rıza Alp Güler, Natalia Neverova, Iasonas Kokkinos_
[[`densepose.org`](https://densepose.org)] [[`arXiv`](https://arxiv.org/abs/1802.00434)] [[`BibTeX`](#CitingDensePose)]
Dense human pose estimation aims at mapping all human pixels of an RGB image to the 3D surface of the human body.
<div align="center">
<img src="https://drive.google.com/uc?export=view&id=1qfSOkpueo1kVZbXOuQJJhyagKjMgepsz" width="700px" />
</div>
In this repository, we provide the code to train and evaluate DensePose-RCNN. We also provide tools to visualize
DensePose annotations and results.
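For example, a trained model can be run programmatically with Detectron2's `DefaultPredictor`; the sketch below is illustrative only, and the config and checkpoint paths are placeholders:
```
from detectron2.config import get_cfg
from detectron2.data.detection_utils import read_image
from detectron2.engine.defaults import DefaultPredictor

from densepose import add_densepose_config

# Compose a DensePose config on top of the Detectron2 defaults.
cfg = get_cfg()
add_densepose_config(cfg)
cfg.merge_from_file("configs/densepose_rcnn_R_50_FPN_s1x.yaml")  # placeholder config
cfg.MODEL.WEIGHTS = "model_final.pkl"                            # placeholder checkpoint
cfg.freeze()

# Run the model on a single image (the predictor expects BGR input).
predictor = DefaultPredictor(cfg)
img = read_image("image.jpg", format="BGR")
instances = predictor(img)["instances"]
if instances.has("pred_densepose"):
    print(instances.get("scores").cpu())
```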
# Quick Start
See [Getting Started](doc/GETTING_STARTED.md).
# Model Zoo and Baselines
We provide a number of baseline results and trained models available for download. See [Model Zoo](doc/MODEL_ZOO.md) for details.
# License
Detectron2 is released under the [Apache 2.0 license](../../LICENSE)
## <a name="CitingDensePose"></a>Citing DensePose
If you use DensePose, please cite it using the following BibTeX entries:
For DensePose with estimated confidences:
```
@InProceedings{Neverova2019DensePoseConfidences,
title = {Correlated Uncertainty for Learning Dense Correspondences from Noisy Labels},
author = {Neverova, Natalia and Novotny, David and Vedaldi, Andrea},
    booktitle = {Advances in Neural Information Processing Systems},
year = {2019},
}
```
For the original DensePose:
```
@InProceedings{Guler2018DensePose,
title={DensePose: Dense Human Pose Estimation In The Wild},
  author={R{\i}za Alp G\"uler and Natalia Neverova and Iasonas Kokkinos},
  booktitle={The IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
year={2018}
}
```

View File

@ -0,0 +1,319 @@
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import argparse
import glob
import logging
import os
import pickle
import sys
from typing import Any, ClassVar, Dict, List
import torch
from detectron2.config import get_cfg
from detectron2.data.detection_utils import read_image
from detectron2.engine.defaults import DefaultPredictor
from detectron2.structures.boxes import BoxMode
from detectron2.structures.instances import Instances
from detectron2.utils.logger import setup_logger
from densepose import add_densepose_config, add_hrnet_config
from densepose.utils.logger import verbosity_to_level
from densepose.vis.base import CompoundVisualizer
from densepose.vis.bounding_box import ScoredBoundingBoxVisualizer
from densepose.vis.densepose import (
DensePoseResultsContourVisualizer,
DensePoseResultsFineSegmentationVisualizer,
DensePoseResultsUVisualizer,
DensePoseResultsVVisualizer,
)
from densepose.vis.extractor import CompoundExtractor, create_extractor
DOC = """Apply Net - a tool to print / visualize DensePose results
"""
LOGGER_NAME = "apply_net"
logger = logging.getLogger(LOGGER_NAME)
_ACTION_REGISTRY: Dict[str, "Action"] = {}
class Action(object):
@classmethod
def add_arguments(cls: type, parser: argparse.ArgumentParser):
parser.add_argument(
"-v",
"--verbosity",
action="count",
help="Verbose mode. Multiple -v options increase the verbosity.",
)
def register_action(cls: type):
"""
Decorator for action classes to automate action registration
"""
global _ACTION_REGISTRY
_ACTION_REGISTRY[cls.COMMAND] = cls
return cls
class InferenceAction(Action):
@classmethod
def add_arguments(cls: type, parser: argparse.ArgumentParser):
super(InferenceAction, cls).add_arguments(parser)
parser.add_argument("cfg", metavar="<config>", help="Config file")
parser.add_argument("model", metavar="<model>", help="Model file")
parser.add_argument("input", metavar="<input>", help="Input data")
parser.add_argument(
"--opts",
help="Modify config options using the command-line 'KEY VALUE' pairs",
default=[],
nargs=argparse.REMAINDER,
)
@classmethod
def execute(cls: type, args: argparse.Namespace):
logger.info(f"Loading config from {args.cfg}")
opts = []
cfg = cls.setup_config(args.cfg, args.model, args, opts)
logger.info(f"Loading model from {args.model}")
predictor = DefaultPredictor(cfg)
logger.info(f"Loading data from {args.input}")
file_list = cls._get_input_file_list(args.input)
if len(file_list) == 0:
logger.warning(f"No input images for {args.input}")
return
context = cls.create_context(args)
for file_name in file_list:
img = read_image(file_name, format="BGR") # predictor expects BGR image.
with torch.no_grad():
outputs = predictor(img)["instances"]
cls.execute_on_outputs(context, {"file_name": file_name, "image": img}, outputs)
cls.postexecute(context)
@classmethod
def setup_config(
cls: type, config_fpath: str, model_fpath: str, args: argparse.Namespace, opts: List[str]
):
cfg = get_cfg()
add_densepose_config(cfg)
add_hrnet_config(cfg)
cfg.merge_from_file(config_fpath)
cfg.merge_from_list(args.opts)
if opts:
cfg.merge_from_list(opts)
cfg.MODEL.WEIGHTS = model_fpath
cfg.freeze()
return cfg
@classmethod
def _get_input_file_list(cls: type, input_spec: str):
if os.path.isdir(input_spec):
file_list = [
os.path.join(input_spec, fname)
for fname in os.listdir(input_spec)
if os.path.isfile(os.path.join(input_spec, fname))
]
elif os.path.isfile(input_spec):
file_list = [input_spec]
else:
file_list = glob.glob(input_spec)
return file_list
@register_action
class DumpAction(InferenceAction):
"""
Dump action that outputs results to a pickle file
"""
COMMAND: ClassVar[str] = "dump"
@classmethod
def add_parser(cls: type, subparsers: argparse._SubParsersAction):
parser = subparsers.add_parser(cls.COMMAND, help="Dump model outputs to a file.")
cls.add_arguments(parser)
parser.set_defaults(func=cls.execute)
@classmethod
def add_arguments(cls: type, parser: argparse.ArgumentParser):
super(DumpAction, cls).add_arguments(parser)
parser.add_argument(
"--output",
metavar="<dump_file>",
default="results.pkl",
help="File name to save dump to",
)
@classmethod
def execute_on_outputs(
cls: type, context: Dict[str, Any], entry: Dict[str, Any], outputs: Instances
):
image_fpath = entry["file_name"]
logger.info(f"Processing {image_fpath}")
result = {"file_name": image_fpath}
if outputs.has("scores"):
result["scores"] = outputs.get("scores").cpu()
if outputs.has("pred_boxes"):
result["pred_boxes_XYXY"] = outputs.get("pred_boxes").tensor.cpu()
if outputs.has("pred_densepose"):
boxes_XYWH = BoxMode.convert(
result["pred_boxes_XYXY"], BoxMode.XYXY_ABS, BoxMode.XYWH_ABS
)
result["pred_densepose"] = outputs.get("pred_densepose").to_result(boxes_XYWH)
context["results"].append(result)
@classmethod
def create_context(cls: type, args: argparse.Namespace):
context = {"results": [], "out_fname": args.output}
return context
@classmethod
def postexecute(cls: type, context: Dict[str, Any]):
out_fname = context["out_fname"]
out_dir = os.path.dirname(out_fname)
if len(out_dir) > 0 and not os.path.exists(out_dir):
os.makedirs(out_dir)
with open(out_fname, "wb") as hFile:
pickle.dump(context["results"], hFile)
logger.info(f"Output saved to {out_fname}")
@register_action
class ShowAction(InferenceAction):
"""
Show action that visualizes selected entries on an image
"""
COMMAND: ClassVar[str] = "show"
VISUALIZERS: ClassVar[Dict[str, object]] = {
"dp_contour": DensePoseResultsContourVisualizer,
"dp_segm": DensePoseResultsFineSegmentationVisualizer,
"dp_u": DensePoseResultsUVisualizer,
"dp_v": DensePoseResultsVVisualizer,
"bbox": ScoredBoundingBoxVisualizer,
}
@classmethod
def add_parser(cls: type, subparsers: argparse._SubParsersAction):
parser = subparsers.add_parser(cls.COMMAND, help="Visualize selected entries")
cls.add_arguments(parser)
parser.set_defaults(func=cls.execute)
@classmethod
def add_arguments(cls: type, parser: argparse.ArgumentParser):
super(ShowAction, cls).add_arguments(parser)
parser.add_argument(
"visualizations",
metavar="<visualizations>",
help="Comma separated list of visualizations, possible values: "
"[{}]".format(",".join(sorted(cls.VISUALIZERS.keys()))),
)
parser.add_argument(
"--min_score",
metavar="<score>",
default=0.8,
type=float,
help="Minimum detection score to visualize",
)
parser.add_argument(
"--nms_thresh", metavar="<threshold>", default=None, type=float, help="NMS threshold"
)
parser.add_argument(
"--output",
metavar="<image_file>",
default="outputres.png",
help="File name to save output to",
)
@classmethod
def setup_config(
cls: type, config_fpath: str, model_fpath: str, args: argparse.Namespace, opts: List[str]
):
opts.append("MODEL.ROI_HEADS.SCORE_THRESH_TEST")
opts.append(str(args.min_score))
if args.nms_thresh is not None:
opts.append("MODEL.ROI_HEADS.NMS_THRESH_TEST")
opts.append(str(args.nms_thresh))
cfg = super(ShowAction, cls).setup_config(config_fpath, model_fpath, args, opts)
return cfg
@classmethod
def execute_on_outputs(
cls: type, context: Dict[str, Any], entry: Dict[str, Any], outputs: Instances
):
import cv2
import numpy as np
visualizer = context["visualizer"]
extractor = context["extractor"]
image_fpath = entry["file_name"]
logger.info(f"Processing {image_fpath}")
image = cv2.cvtColor(entry["image"], cv2.COLOR_BGR2GRAY)
image = np.tile(image[:, :, np.newaxis], [1, 1, 3])
data = extractor(outputs)
image_vis = visualizer.visualize(image, data)
entry_idx = context["entry_idx"] + 1
out_fname = cls._get_out_fname(entry_idx, context["out_fname"])
out_dir = os.path.dirname(out_fname)
if len(out_dir) > 0 and not os.path.exists(out_dir):
os.makedirs(out_dir)
cv2.imwrite(out_fname, image_vis)
logger.info(f"Output saved to {out_fname}")
context["entry_idx"] += 1
@classmethod
def postexecute(cls: type, context: Dict[str, Any]):
pass
@classmethod
def _get_out_fname(cls: type, entry_idx: int, fname_base: str):
base, ext = os.path.splitext(fname_base)
return base + ".{0:04d}".format(entry_idx) + ext
@classmethod
def create_context(cls: type, args: argparse.Namespace) -> Dict[str, Any]:
vis_specs = args.visualizations.split(",")
visualizers = []
extractors = []
for vis_spec in vis_specs:
vis = cls.VISUALIZERS[vis_spec]()
visualizers.append(vis)
extractor = create_extractor(vis)
extractors.append(extractor)
visualizer = CompoundVisualizer(visualizers)
extractor = CompoundExtractor(extractors)
context = {
"extractor": extractor,
"visualizer": visualizer,
"out_fname": args.output,
"entry_idx": 0,
}
return context
def create_argument_parser() -> argparse.ArgumentParser:
parser = argparse.ArgumentParser(
description=DOC,
formatter_class=lambda prog: argparse.HelpFormatter(prog, max_help_position=120),
)
parser.set_defaults(func=lambda _: parser.print_help(sys.stdout))
subparsers = parser.add_subparsers(title="Actions")
for _, action in _ACTION_REGISTRY.items():
action.add_parser(subparsers)
return parser
def main():
parser = create_argument_parser()
args = parser.parse_args()
verbosity = args.verbosity if hasattr(args, "verbosity") else None
global logger
logger = setup_logger(name=LOGGER_NAME)
logger.setLevel(verbosity_to_level(verbosity))
args.func(args)
if __name__ == "__main__":
main()
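For reference, the parser defined above can also be driven programmatically. A minimal sketch, assuming the script is saved as `apply_net.py` and importable from the working directory; every path below is a placeholder:
```
from apply_net import create_argument_parser

# Build the CLI parser and invoke the "show" action without a shell.
parser = create_argument_parser()
args = parser.parse_args([
    "show",
    "configs/densepose_rcnn_R_50_FPN_s1x.yaml",  # <config>   (placeholder)
    "model_final.pkl",                           # <model>    (placeholder)
    "images",                                    # <input>: file, directory or glob
    "dp_contour,bbox",                           # visualizations to overlay
    "--min_score", "0.9",
    "--output", "image_densepose_contour.png",
])
args.func(args)  # dispatches to ShowAction.execute via set_defaults(func=...)
```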

View File

@ -0,0 +1,48 @@
VERSION: 2
MODEL:
META_ARCHITECTURE: "GeneralizedRCNN"
BACKBONE:
NAME: "build_resnet_fpn_backbone"
RESNETS:
OUT_FEATURES: ["res2", "res3", "res4", "res5"]
FPN:
IN_FEATURES: ["res2", "res3", "res4", "res5"]
ANCHOR_GENERATOR:
SIZES: [[32], [64], [128], [256], [512]] # One size for each in feature map
ASPECT_RATIOS: [[0.5, 1.0, 2.0]] # Three aspect ratios (same for all in feature maps)
RPN:
IN_FEATURES: ["p2", "p3", "p4", "p5", "p6"]
PRE_NMS_TOPK_TRAIN: 2000 # Per FPN level
PRE_NMS_TOPK_TEST: 1000 # Per FPN level
# Detectron1 uses 2000 proposals per-batch,
# (See "modeling/rpn/rpn_outputs.py" for details of this legacy issue)
# which is approximately 1000 proposals per-image since the default batch size for FPN is 2.
POST_NMS_TOPK_TRAIN: 1000
POST_NMS_TOPK_TEST: 1000
DENSEPOSE_ON: True
ROI_HEADS:
NAME: "DensePoseROIHeads"
IN_FEATURES: ["p2", "p3", "p4", "p5"]
NUM_CLASSES: 1
ROI_BOX_HEAD:
NAME: "FastRCNNConvFCHead"
NUM_FC: 2
POOLER_RESOLUTION: 7
POOLER_SAMPLING_RATIO: 2
POOLER_TYPE: "ROIAlign"
ROI_DENSEPOSE_HEAD:
NAME: "DensePoseV1ConvXHead"
POOLER_TYPE: "ROIAlign"
NUM_COARSE_SEGM_CHANNELS: 2
DATASETS:
TRAIN: ("densepose_coco_2014_train", "densepose_coco_2014_valminusminival")
TEST: ("densepose_coco_2014_minival",)
SOLVER:
IMS_PER_BATCH: 16
BASE_LR: 0.01
STEPS: (60000, 80000)
MAX_ITER: 90000
WARMUP_FACTOR: 0.1
INPUT:
MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)

View File

@ -0,0 +1,16 @@
_BASE_: "../Base-DensePose-RCNN-FPN.yaml"
MODEL:
WEIGHTS: "https://1drv.ms/u/s!Aus8VCZ_C_33dYBMemi9xOUFR0w"
BACKBONE:
NAME: "build_hrfpn_backbone"
RPN:
IN_FEATURES: ['p1', 'p2', 'p3', 'p4', 'p5']
ROI_HEADS:
IN_FEATURES: ['p1', 'p2', 'p3', 'p4', 'p5']
SOLVER:
MAX_ITER: 130000
STEPS: (100000, 120000)
CLIP_GRADIENTS:
ENABLED: True
CLIP_TYPE: "norm"
BASE_LR: 0.03

View File

@ -0,0 +1,23 @@
_BASE_: "../Base-DensePose-RCNN-FPN.yaml"
MODEL:
WEIGHTS: "https://1drv.ms/u/s!Aus8VCZ_C_33ck0gvo5jfoWBOPo"
BACKBONE:
NAME: "build_hrfpn_backbone"
RPN:
IN_FEATURES: ['p1', 'p2', 'p3', 'p4', 'p5']
ROI_HEADS:
IN_FEATURES: ['p1', 'p2', 'p3', 'p4', 'p5']
HRNET:
STAGE2:
NUM_CHANNELS: [40, 80]
STAGE3:
NUM_CHANNELS: [40, 80, 160]
STAGE4:
NUM_CHANNELS: [40, 80, 160, 320]
SOLVER:
MAX_ITER: 130000
STEPS: (100000, 120000)
CLIP_GRADIENTS:
ENABLED: True
CLIP_TYPE: "norm"
BASE_LR: 0.03

View File

@ -0,0 +1,23 @@
_BASE_: "../Base-DensePose-RCNN-FPN.yaml"
MODEL:
WEIGHTS: "https://1drv.ms/u/s!Aus8VCZ_C_33dKvqI6pBZlifgJk"
BACKBONE:
NAME: "build_hrfpn_backbone"
RPN:
IN_FEATURES: ['p1', 'p2', 'p3', 'p4', 'p5']
ROI_HEADS:
IN_FEATURES: ['p1', 'p2', 'p3', 'p4', 'p5']
HRNET:
STAGE2:
NUM_CHANNELS: [48, 96]
STAGE3:
NUM_CHANNELS: [48, 96, 192]
STAGE4:
NUM_CHANNELS: [48, 96, 192, 384]
SOLVER:
MAX_ITER: 130000
STEPS: (100000, 120000)
CLIP_GRADIENTS:
ENABLED: True
CLIP_TYPE: "norm"
BASE_LR: 0.03

View File

@ -0,0 +1,18 @@
_BASE_: "Base-DensePose-RCNN-FPN.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
RESNETS:
DEPTH: 101
ROI_DENSEPOSE_HEAD:
NAME: "DensePoseDeepLabHead"
UV_CONFIDENCE:
ENABLED: True
TYPE: "iid_iso"
SEGM_CONFIDENCE:
ENABLED: True
POINT_REGRESSION_WEIGHTS: 0.0005
SOLVER:
CLIP_GRADIENTS:
ENABLED: True
MAX_ITER: 130000
STEPS: (100000, 120000)

View File

@ -0,0 +1,16 @@
_BASE_: "Base-DensePose-RCNN-FPN.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
RESNETS:
DEPTH: 101
ROI_DENSEPOSE_HEAD:
NAME: "DensePoseDeepLabHead"
UV_CONFIDENCE:
ENABLED: True
TYPE: "iid_iso"
POINT_REGRESSION_WEIGHTS: 0.0005
SOLVER:
CLIP_GRADIENTS:
ENABLED: True
MAX_ITER: 130000
STEPS: (100000, 120000)

View File

@ -0,0 +1,18 @@
_BASE_: "Base-DensePose-RCNN-FPN.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
RESNETS:
DEPTH: 101
ROI_DENSEPOSE_HEAD:
NAME: "DensePoseDeepLabHead"
UV_CONFIDENCE:
ENABLED: True
TYPE: "indep_aniso"
SEGM_CONFIDENCE:
ENABLED: True
POINT_REGRESSION_WEIGHTS: 0.0005
SOLVER:
CLIP_GRADIENTS:
ENABLED: True
MAX_ITER: 130000
STEPS: (100000, 120000)

View File

@ -0,0 +1,16 @@
_BASE_: "Base-DensePose-RCNN-FPN.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
RESNETS:
DEPTH: 101
ROI_DENSEPOSE_HEAD:
NAME: "DensePoseDeepLabHead"
UV_CONFIDENCE:
ENABLED: True
TYPE: "indep_aniso"
POINT_REGRESSION_WEIGHTS: 0.0005
SOLVER:
CLIP_GRADIENTS:
ENABLED: True
MAX_ITER: 130000
STEPS: (100000, 120000)

View File

@ -0,0 +1,10 @@
_BASE_: "Base-DensePose-RCNN-FPN.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
RESNETS:
DEPTH: 101
ROI_DENSEPOSE_HEAD:
NAME: "DensePoseDeepLabHead"
SOLVER:
MAX_ITER: 130000
STEPS: (100000, 120000)

View File

@ -0,0 +1,18 @@
_BASE_: "Base-DensePose-RCNN-FPN.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
RESNETS:
DEPTH: 101
ROI_DENSEPOSE_HEAD:
UV_CONFIDENCE:
ENABLED: True
TYPE: "iid_iso"
SEGM_CONFIDENCE:
ENABLED: True
POINT_REGRESSION_WEIGHTS: 0.0005
SOLVER:
CLIP_GRADIENTS:
ENABLED: True
MAX_ITER: 130000
STEPS: (100000, 120000)
WARMUP_FACTOR: 0.025

View File

@ -0,0 +1,16 @@
_BASE_: "Base-DensePose-RCNN-FPN.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
RESNETS:
DEPTH: 101
ROI_DENSEPOSE_HEAD:
UV_CONFIDENCE:
ENABLED: True
TYPE: "iid_iso"
POINT_REGRESSION_WEIGHTS: 0.0005
SOLVER:
CLIP_GRADIENTS:
ENABLED: True
MAX_ITER: 130000
STEPS: (100000, 120000)
WARMUP_FACTOR: 0.025

View File

@ -0,0 +1,18 @@
_BASE_: "Base-DensePose-RCNN-FPN.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
RESNETS:
DEPTH: 101
ROI_DENSEPOSE_HEAD:
UV_CONFIDENCE:
ENABLED: True
TYPE: "indep_aniso"
SEGM_CONFIDENCE:
ENABLED: True
POINT_REGRESSION_WEIGHTS: 0.0005
SOLVER:
CLIP_GRADIENTS:
ENABLED: True
MAX_ITER: 130000
STEPS: (100000, 120000)
WARMUP_FACTOR: 0.025

View File

@ -0,0 +1,16 @@
_BASE_: "Base-DensePose-RCNN-FPN.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
RESNETS:
DEPTH: 101
ROI_DENSEPOSE_HEAD:
UV_CONFIDENCE:
ENABLED: True
TYPE: "indep_aniso"
POINT_REGRESSION_WEIGHTS: 0.0005
SOLVER:
CLIP_GRADIENTS:
ENABLED: True
MAX_ITER: 130000
STEPS: (100000, 120000)
WARMUP_FACTOR: 0.025

View File

@ -0,0 +1,8 @@
_BASE_: "Base-DensePose-RCNN-FPN.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
RESNETS:
DEPTH: 101
SOLVER:
MAX_ITER: 130000
STEPS: (100000, 120000)

View File

@ -0,0 +1,17 @@
_BASE_: "Base-DensePose-RCNN-FPN.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
RESNETS:
DEPTH: 101
ROI_DENSEPOSE_HEAD:
NUM_COARSE_SEGM_CHANNELS: 15
POOLER_RESOLUTION: 14
HEATMAP_SIZE: 56
INDEX_WEIGHTS: 2.0
PART_WEIGHTS: 0.3
POINT_REGRESSION_WEIGHTS: 0.1
DECODER_ON: False
SOLVER:
BASE_LR: 0.002
MAX_ITER: 130000
STEPS: (100000, 120000)

View File

@ -0,0 +1,18 @@
_BASE_: "Base-DensePose-RCNN-FPN.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
RESNETS:
DEPTH: 50
ROI_DENSEPOSE_HEAD:
NAME: "DensePoseDeepLabHead"
UV_CONFIDENCE:
ENABLED: True
TYPE: "iid_iso"
SEGM_CONFIDENCE:
ENABLED: True
POINT_REGRESSION_WEIGHTS: 0.0005
SOLVER:
CLIP_GRADIENTS:
ENABLED: True
MAX_ITER: 130000
STEPS: (100000, 120000)

View File

@ -0,0 +1,16 @@
_BASE_: "Base-DensePose-RCNN-FPN.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
RESNETS:
DEPTH: 50
ROI_DENSEPOSE_HEAD:
NAME: "DensePoseDeepLabHead"
UV_CONFIDENCE:
ENABLED: True
TYPE: "iid_iso"
POINT_REGRESSION_WEIGHTS: 0.0005
SOLVER:
CLIP_GRADIENTS:
ENABLED: True
MAX_ITER: 130000
STEPS: (100000, 120000)

View File

@ -0,0 +1,18 @@
_BASE_: "Base-DensePose-RCNN-FPN.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
RESNETS:
DEPTH: 50
ROI_DENSEPOSE_HEAD:
NAME: "DensePoseDeepLabHead"
UV_CONFIDENCE:
ENABLED: True
TYPE: "indep_aniso"
SEGM_CONFIDENCE:
ENABLED: True
POINT_REGRESSION_WEIGHTS: 0.0005
SOLVER:
CLIP_GRADIENTS:
ENABLED: True
MAX_ITER: 130000
STEPS: (100000, 120000)

View File

@ -0,0 +1,16 @@
_BASE_: "Base-DensePose-RCNN-FPN.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
RESNETS:
DEPTH: 50
ROI_DENSEPOSE_HEAD:
NAME: "DensePoseDeepLabHead"
UV_CONFIDENCE:
ENABLED: True
TYPE: "indep_aniso"
POINT_REGRESSION_WEIGHTS: 0.0005
SOLVER:
CLIP_GRADIENTS:
ENABLED: True
MAX_ITER: 130000
STEPS: (100000, 120000)

View File

@ -0,0 +1,10 @@
_BASE_: "Base-DensePose-RCNN-FPN.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
RESNETS:
DEPTH: 50
ROI_DENSEPOSE_HEAD:
NAME: "DensePoseDeepLabHead"
SOLVER:
MAX_ITER: 130000
STEPS: (100000, 120000)

View File

@ -0,0 +1,20 @@
_BASE_: "Base-DensePose-RCNN-FPN.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
RESNETS:
DEPTH: 50
ROI_DENSEPOSE_HEAD:
UV_CONFIDENCE:
ENABLED: True
TYPE: "iid_iso"
SEGM_CONFIDENCE:
ENABLED: True
POINT_REGRESSION_WEIGHTS: 0.0005
SOLVER:
CLIP_GRADIENTS:
ENABLED: True
CLIP_TYPE: norm
CLIP_VALUE: 100.0
MAX_ITER: 130000
STEPS: (100000, 120000)
WARMUP_FACTOR: 0.025

View File

@ -0,0 +1,16 @@
_BASE_: "Base-DensePose-RCNN-FPN.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
RESNETS:
DEPTH: 50
ROI_DENSEPOSE_HEAD:
UV_CONFIDENCE:
ENABLED: True
TYPE: "iid_iso"
POINT_REGRESSION_WEIGHTS: 0.0005
SOLVER:
CLIP_GRADIENTS:
ENABLED: True
MAX_ITER: 130000
STEPS: (100000, 120000)
WARMUP_FACTOR: 0.025

View File

@ -0,0 +1,18 @@
_BASE_: "Base-DensePose-RCNN-FPN.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
RESNETS:
DEPTH: 50
ROI_DENSEPOSE_HEAD:
UV_CONFIDENCE:
ENABLED: True
TYPE: "indep_aniso"
SEGM_CONFIDENCE:
ENABLED: True
POINT_REGRESSION_WEIGHTS: 0.0005
SOLVER:
CLIP_GRADIENTS:
ENABLED: True
MAX_ITER: 130000
STEPS: (100000, 120000)
WARMUP_FACTOR: 0.025

View File

@ -0,0 +1,16 @@
_BASE_: "Base-DensePose-RCNN-FPN.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
RESNETS:
DEPTH: 50
ROI_DENSEPOSE_HEAD:
UV_CONFIDENCE:
ENABLED: True
TYPE: "indep_aniso"
POINT_REGRESSION_WEIGHTS: 0.0005
SOLVER:
CLIP_GRADIENTS:
ENABLED: True
MAX_ITER: 130000
STEPS: (100000, 120000)
WARMUP_FACTOR: 0.025

View File

@ -0,0 +1,8 @@
_BASE_: "Base-DensePose-RCNN-FPN.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
RESNETS:
DEPTH: 50
SOLVER:
MAX_ITER: 130000
STEPS: (100000, 120000)

View File

@ -0,0 +1,17 @@
_BASE_: "Base-DensePose-RCNN-FPN.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
RESNETS:
DEPTH: 50
ROI_DENSEPOSE_HEAD:
NUM_COARSE_SEGM_CHANNELS: 15
POOLER_RESOLUTION: 14
HEATMAP_SIZE: 56
INDEX_WEIGHTS: 2.0
PART_WEIGHTS: 0.3
POINT_REGRESSION_WEIGHTS: 0.1
DECODER_ON: False
SOLVER:
BASE_LR: 0.002
MAX_ITER: 130000
STEPS: (100000, 120000)

View File

@ -0,0 +1,121 @@
MODEL:
META_ARCHITECTURE: "GeneralizedRCNN"
BACKBONE:
NAME: "build_resnet_fpn_backbone"
RESNETS:
OUT_FEATURES: ["res2", "res3", "res4", "res5"]
FPN:
IN_FEATURES: ["res2", "res3", "res4", "res5"]
ANCHOR_GENERATOR:
SIZES: [[32], [64], [128], [256], [512]] # One size for each in feature map
ASPECT_RATIOS: [[0.5, 1.0, 2.0]] # Three aspect ratios (same for all in feature maps)
RPN:
IN_FEATURES: ["p2", "p3", "p4", "p5", "p6"]
PRE_NMS_TOPK_TRAIN: 2000 # Per FPN level
PRE_NMS_TOPK_TEST: 1000 # Per FPN level
# Detectron1 uses 2000 proposals per-batch,
# (See "modeling/rpn/rpn_outputs.py" for details of this legacy issue)
# which is approximately 1000 proposals per-image since the default batch size for FPN is 2.
POST_NMS_TOPK_TRAIN: 1000
POST_NMS_TOPK_TEST: 1000
ROI_HEADS:
NAME: "StandardROIHeads"
IN_FEATURES: ["p2", "p3", "p4", "p5"]
NUM_CLASSES: 1
ROI_BOX_HEAD:
NAME: "FastRCNNConvFCHead"
NUM_FC: 2
POOLER_RESOLUTION: 7
ROI_MASK_HEAD:
NAME: "MaskRCNNConvUpsampleHead"
NUM_CONV: 4
POOLER_RESOLUTION: 14
DATASETS:
TRAIN: ("base_coco_2017_train",)
TEST: ("base_coco_2017_val", "densepose_chimps")
CATEGORY_MAPS:
"base_coco_2017_train":
"16": 1 # bird -> person
"17": 1 # cat -> person
"18": 1 # dog -> person
"19": 1 # horse -> person
"20": 1 # sheep -> person
"21": 1 # cow -> person
"22": 1 # elephant -> person
"23": 1 # bear -> person
"24": 1 # zebra -> person
"25": 1 # girafe -> person
"base_coco_2017_val":
"16": 1 # bird -> person
"17": 1 # cat -> person
"18": 1 # dog -> person
"19": 1 # horse -> person
"20": 1 # sheep -> person
"21": 1 # cow -> person
"22": 1 # elephant -> person
"23": 1 # bear -> person
"24": 1 # zebra -> person
"25": 1 # girafe -> person
WHITELISTED_CATEGORIES:
"base_coco_2017_train":
- 1 # person
- 16 # bird
- 17 # cat
- 18 # dog
- 19 # horse
- 20 # sheep
- 21 # cow
- 22 # elephant
- 23 # bear
- 24 # zebra
      - 25 # giraffe
"base_coco_2017_val":
- 1 # person
- 16 # bird
- 17 # cat
- 18 # dog
- 19 # horse
- 20 # sheep
- 21 # cow
- 22 # elephant
- 23 # bear
- 24 # zebra
      - 25 # giraffe
BOOTSTRAP_DATASETS:
- DATASET: "chimpnsee"
RATIO: 1.0
IMAGE_LOADER:
TYPE: "video_keyframe"
SELECT:
STRATEGY: "random_k"
NUM_IMAGES: 4
TRANSFORM:
TYPE: "resize"
MIN_SIZE: 800
MAX_SIZE: 1333
BATCH_SIZE: 8
NUM_WORKERS: 1
INFERENCE:
INPUT_BATCH_SIZE: 1
OUTPUT_BATCH_SIZE: 1
DATA_SAMPLER:
# supported types:
# densepose_uniform
# densepose_UV_confidence
# densepose_fine_segm_confidence
# densepose_coarse_segm_confidence
TYPE: "densepose_uniform"
COUNT_PER_CLASS: 8
FILTER:
TYPE: "detection_score"
MIN_VALUE: 0.8
BOOTSTRAP_MODEL:
WEIGHTS: ""
SOLVER:
IMS_PER_BATCH: 16
BASE_LR: 0.02
STEPS: (60000, 80000)
MAX_ITER: 90000
INPUT:
MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
VERSION: 2

View File

@ -0,0 +1,91 @@
MODEL:
META_ARCHITECTURE: "GeneralizedRCNN"
BACKBONE:
NAME: "build_resnet_fpn_backbone"
RESNETS:
OUT_FEATURES: ["res2", "res3", "res4", "res5"]
FPN:
IN_FEATURES: ["res2", "res3", "res4", "res5"]
ANCHOR_GENERATOR:
SIZES: [[32], [64], [128], [256], [512]] # One size for each in feature map
ASPECT_RATIOS: [[0.5, 1.0, 2.0]] # Three aspect ratios (same for all in feature maps)
RPN:
IN_FEATURES: ["p2", "p3", "p4", "p5", "p6"]
PRE_NMS_TOPK_TRAIN: 2000 # Per FPN level
PRE_NMS_TOPK_TEST: 1000 # Per FPN level
# Detectron1 uses 2000 proposals per-batch,
# (See "modeling/rpn/rpn_outputs.py" for details of this legacy issue)
# which is approximately 1000 proposals per-image since the default batch size for FPN is 2.
POST_NMS_TOPK_TRAIN: 1000
POST_NMS_TOPK_TEST: 1000
ROI_HEADS:
NAME: "StandardROIHeads"
IN_FEATURES: ["p2", "p3", "p4", "p5"]
NUM_CLASSES: 1
ROI_BOX_HEAD:
NAME: "FastRCNNConvFCHead"
NUM_FC: 2
POOLER_RESOLUTION: 7
ROI_MASK_HEAD:
NAME: "MaskRCNNConvUpsampleHead"
NUM_CONV: 4
POOLER_RESOLUTION: 14
DATASETS:
TRAIN: ("base_coco_2017_train",)
TEST: ("base_coco_2017_val", "densepose_chimps")
CATEGORY_MAPS:
"base_coco_2017_train":
"16": 1 # bird -> person
"17": 1 # cat -> person
"18": 1 # dog -> person
"19": 1 # horse -> person
"20": 1 # sheep -> person
"21": 1 # cow -> person
"22": 1 # elephant -> person
"23": 1 # bear -> person
"24": 1 # zebra -> person
"25": 1 # girafe -> person
"base_coco_2017_val":
"16": 1 # bird -> person
"17": 1 # cat -> person
"18": 1 # dog -> person
"19": 1 # horse -> person
"20": 1 # sheep -> person
"21": 1 # cow -> person
"22": 1 # elephant -> person
"23": 1 # bear -> person
"24": 1 # zebra -> person
"25": 1 # girafe -> person
WHITELISTED_CATEGORIES:
"base_coco_2017_train":
- 1 # person
- 16 # bird
- 17 # cat
- 18 # dog
- 19 # horse
- 20 # sheep
- 21 # cow
- 22 # elephant
- 23 # bear
- 24 # zebra
      - 25 # giraffe
"base_coco_2017_val":
- 1 # person
- 16 # bird
- 17 # cat
- 18 # dog
- 19 # horse
- 20 # sheep
- 21 # cow
- 22 # elephant
- 23 # bear
- 24 # zebra
      - 25 # giraffe
SOLVER:
IMS_PER_BATCH: 16
BASE_LR: 0.02
STEPS: (60000, 80000)
MAX_ITER: 90000
INPUT:
MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
VERSION: 2

View File

@ -0,0 +1,19 @@
_BASE_: "Base-RCNN-FPN-MC.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
RESNETS:
DEPTH: 101
DENSEPOSE_ON: True
ROI_HEADS:
NAME: "DensePoseROIHeads"
IN_FEATURES: ["p2", "p3", "p4", "p5"]
NUM_CLASSES: 1
ROI_DENSEPOSE_HEAD:
NAME: "DensePoseV1ConvXHead"
POOLER_TYPE: "ROIAlign"
NUM_COARSE_SEGM_CHANNELS: 2
COARSE_SEGM_TRAINED_BY_MASKS: True
INDEX_WEIGHTS: 1.0
DATASETS:
TRAIN: ("base_coco_2017_train", "densepose_coco_2014_train")
TEST: ("densepose_chimps",)

View File

@ -0,0 +1,19 @@
_BASE_: "Base-RCNN-FPN-MC.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
RESNETS:
DEPTH: 101
DENSEPOSE_ON: True
ROI_HEADS:
NAME: "DensePoseROIHeads"
IN_FEATURES: ["p2", "p3", "p4", "p5"]
NUM_CLASSES: 1
ROI_DENSEPOSE_HEAD:
NAME: "DensePoseDeepLabHead"
POOLER_TYPE: "ROIAlign"
NUM_COARSE_SEGM_CHANNELS: 2
COARSE_SEGM_TRAINED_BY_MASKS: True
INDEX_WEIGHTS: 1.0
DATASETS:
TRAIN: ("base_coco_2017_train", "densepose_coco_2014_train")
TEST: ("densepose_chimps",)

View File

@ -0,0 +1,29 @@
_BASE_: "Base-RCNN-FPN-MC.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
RESNETS:
DEPTH: 101
DENSEPOSE_ON: True
ROI_HEADS:
NAME: "DensePoseROIHeads"
IN_FEATURES: ["p2", "p3", "p4", "p5"]
NUM_CLASSES: 1
ROI_DENSEPOSE_HEAD:
NAME: "DensePoseDeepLabHead"
UV_CONFIDENCE:
ENABLED: True
TYPE: "iid_iso"
SEGM_CONFIDENCE:
ENABLED: True
POINT_REGRESSION_WEIGHTS: 0.0005
POOLER_TYPE: "ROIAlign"
NUM_COARSE_SEGM_CHANNELS: 2
COARSE_SEGM_TRAINED_BY_MASKS: True
INDEX_WEIGHTS: 1.0
SOLVER:
CLIP_GRADIENTS:
ENABLED: True
WARMUP_FACTOR: 0.025
DATASETS:
TRAIN: ("base_coco_2017_train", "densepose_coco_2014_train")
TEST: ("densepose_chimps",)

View File

@ -0,0 +1,27 @@
_BASE_: "Base-RCNN-FPN-MC.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
RESNETS:
DEPTH: 101
DENSEPOSE_ON: True
ROI_HEADS:
NAME: "DensePoseROIHeads"
IN_FEATURES: ["p2", "p3", "p4", "p5"]
NUM_CLASSES: 1
ROI_DENSEPOSE_HEAD:
NAME: "DensePoseDeepLabHead"
UV_CONFIDENCE:
ENABLED: True
TYPE: "iid_iso"
POINT_REGRESSION_WEIGHTS: 0.0005
POOLER_TYPE: "ROIAlign"
NUM_COARSE_SEGM_CHANNELS: 2
COARSE_SEGM_TRAINED_BY_MASKS: True
INDEX_WEIGHTS: 1.0
SOLVER:
CLIP_GRADIENTS:
ENABLED: True
WARMUP_FACTOR: 0.025
DATASETS:
TRAIN: ("base_coco_2017_train", "densepose_coco_2014_train")
TEST: ("densepose_chimps",)

View File

@ -0,0 +1,29 @@
_BASE_: "Base-RCNN-FPN-MC.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
RESNETS:
DEPTH: 101
DENSEPOSE_ON: True
ROI_HEADS:
NAME: "DensePoseROIHeads"
IN_FEATURES: ["p2", "p3", "p4", "p5"]
NUM_CLASSES: 1
ROI_DENSEPOSE_HEAD:
NAME: "DensePoseV1ConvXHead"
UV_CONFIDENCE:
ENABLED: True
TYPE: "iid_iso"
SEGM_CONFIDENCE:
ENABLED: True
POINT_REGRESSION_WEIGHTS: 0.0005
POOLER_TYPE: "ROIAlign"
NUM_COARSE_SEGM_CHANNELS: 2
COARSE_SEGM_TRAINED_BY_MASKS: True
INDEX_WEIGHTS: 1.0
SOLVER:
CLIP_GRADIENTS:
ENABLED: True
WARMUP_FACTOR: 0.025
DATASETS:
TRAIN: ("base_coco_2017_train", "densepose_coco_2014_train")
TEST: ("densepose_chimps",)

View File

@ -0,0 +1,27 @@
_BASE_: "Base-RCNN-FPN-MC.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
RESNETS:
DEPTH: 101
DENSEPOSE_ON: True
ROI_HEADS:
NAME: "DensePoseROIHeads"
IN_FEATURES: ["p2", "p3", "p4", "p5"]
NUM_CLASSES: 1
ROI_DENSEPOSE_HEAD:
NAME: "DensePoseV1ConvXHead"
UV_CONFIDENCE:
ENABLED: True
TYPE: "iid_iso"
POINT_REGRESSION_WEIGHTS: 0.0005
POOLER_TYPE: "ROIAlign"
NUM_COARSE_SEGM_CHANNELS: 2
COARSE_SEGM_TRAINED_BY_MASKS: True
INDEX_WEIGHTS: 1.0
SOLVER:
CLIP_GRADIENTS:
ENABLED: True
WARMUP_FACTOR: 0.025
DATASETS:
TRAIN: ("base_coco_2017_train", "densepose_coco_2014_train")
TEST: ("densepose_chimps",)

View File

@ -0,0 +1,19 @@
_BASE_: "Base-RCNN-FPN-MC.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
RESNETS:
DEPTH: 50
DENSEPOSE_ON: True
ROI_HEADS:
NAME: "DensePoseROIHeads"
IN_FEATURES: ["p2", "p3", "p4", "p5"]
NUM_CLASSES: 1
ROI_DENSEPOSE_HEAD:
NAME: "DensePoseV1ConvXHead"
POOLER_TYPE: "ROIAlign"
NUM_COARSE_SEGM_CHANNELS: 2
COARSE_SEGM_TRAINED_BY_MASKS: True
INDEX_WEIGHTS: 1.0
DATASETS:
TRAIN: ("base_coco_2017_train", "densepose_coco_2014_train")
TEST: ("densepose_chimps",)

View File

@ -0,0 +1,19 @@
_BASE_: "Base-RCNN-FPN-MC.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
RESNETS:
DEPTH: 50
DENSEPOSE_ON: True
ROI_HEADS:
NAME: "DensePoseROIHeads"
IN_FEATURES: ["p2", "p3", "p4", "p5"]
NUM_CLASSES: 1
ROI_DENSEPOSE_HEAD:
NAME: "DensePoseDeepLabHead"
POOLER_TYPE: "ROIAlign"
NUM_COARSE_SEGM_CHANNELS: 2
COARSE_SEGM_TRAINED_BY_MASKS: True
INDEX_WEIGHTS: 1.0
DATASETS:
TRAIN: ("base_coco_2017_train", "densepose_coco_2014_train")
TEST: ("densepose_chimps",)

View File

@ -0,0 +1,29 @@
_BASE_: "Base-RCNN-FPN-MC.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
RESNETS:
DEPTH: 50
DENSEPOSE_ON: True
ROI_HEADS:
NAME: "DensePoseROIHeads"
IN_FEATURES: ["p2", "p3", "p4", "p5"]
NUM_CLASSES: 1
ROI_DENSEPOSE_HEAD:
NAME: "DensePoseDeepLabHead"
UV_CONFIDENCE:
ENABLED: True
TYPE: "iid_iso"
SEGM_CONFIDENCE:
ENABLED: True
POINT_REGRESSION_WEIGHTS: 0.0005
POOLER_TYPE: "ROIAlign"
NUM_COARSE_SEGM_CHANNELS: 2
COARSE_SEGM_TRAINED_BY_MASKS: True
INDEX_WEIGHTS: 1.0
SOLVER:
CLIP_GRADIENTS:
ENABLED: True
WARMUP_FACTOR: 0.025
DATASETS:
TRAIN: ("base_coco_2017_train", "densepose_coco_2014_train")
TEST: ("densepose_chimps",)

View File

@ -0,0 +1,27 @@
_BASE_: "Base-RCNN-FPN-MC.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
RESNETS:
DEPTH: 50
DENSEPOSE_ON: True
ROI_HEADS:
NAME: "DensePoseROIHeads"
IN_FEATURES: ["p2", "p3", "p4", "p5"]
NUM_CLASSES: 1
ROI_DENSEPOSE_HEAD:
NAME: "DensePoseDeepLabHead"
UV_CONFIDENCE:
ENABLED: True
TYPE: "iid_iso"
POINT_REGRESSION_WEIGHTS: 0.0005
POOLER_TYPE: "ROIAlign"
NUM_COARSE_SEGM_CHANNELS: 2
COARSE_SEGM_TRAINED_BY_MASKS: True
INDEX_WEIGHTS: 1.0
SOLVER:
CLIP_GRADIENTS:
ENABLED: True
WARMUP_FACTOR: 0.025
DATASETS:
TRAIN: ("base_coco_2017_train", "densepose_coco_2014_train")
TEST: ("densepose_chimps",)

View File

@ -0,0 +1,29 @@
_BASE_: "Base-RCNN-FPN-MC.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
RESNETS:
DEPTH: 50
DENSEPOSE_ON: True
ROI_HEADS:
NAME: "DensePoseROIHeads"
IN_FEATURES: ["p2", "p3", "p4", "p5"]
NUM_CLASSES: 1
ROI_DENSEPOSE_HEAD:
NAME: "DensePoseV1ConvXHead"
UV_CONFIDENCE:
ENABLED: True
TYPE: "iid_iso"
SEGM_CONFIDENCE:
ENABLED: True
POINT_REGRESSION_WEIGHTS: 0.0005
POOLER_TYPE: "ROIAlign"
NUM_COARSE_SEGM_CHANNELS: 2
COARSE_SEGM_TRAINED_BY_MASKS: True
INDEX_WEIGHTS: 1.0
SOLVER:
CLIP_GRADIENTS:
ENABLED: True
WARMUP_FACTOR: 0.025
DATASETS:
TRAIN: ("base_coco_2017_train", "densepose_coco_2014_train")
TEST: ("densepose_chimps",)

View File

@ -0,0 +1,30 @@
_BASE_: "Base-RCNN-FPN-MC-B.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
RESNETS:
DEPTH: 50
DENSEPOSE_ON: True
ROI_HEADS:
NAME: "DensePoseROIHeads"
IN_FEATURES: ["p2", "p3", "p4", "p5"]
NUM_CLASSES: 1
ROI_DENSEPOSE_HEAD:
NAME: "DensePoseV1ConvXHead"
UV_CONFIDENCE:
ENABLED: True
TYPE: "iid_iso"
SEGM_CONFIDENCE:
ENABLED: True
POINT_REGRESSION_WEIGHTS: 0.0005
POOLER_TYPE: "ROIAlign"
NUM_COARSE_SEGM_CHANNELS: 2
COARSE_SEGM_TRAINED_BY_MASKS: True
INDEX_WEIGHTS: 1.0
SOLVER:
CLIP_GRADIENTS:
ENABLED: True
CLIP_TYPE: "norm"
WARMUP_FACTOR: 0.025
DATASETS:
TRAIN: ("base_coco_2017_train", "densepose_coco_2014_train")
TEST: ("densepose_chimps",)

View File

@ -0,0 +1,27 @@
_BASE_: "Base-RCNN-FPN-MC.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
RESNETS:
DEPTH: 50
DENSEPOSE_ON: True
ROI_HEADS:
NAME: "DensePoseROIHeads"
IN_FEATURES: ["p2", "p3", "p4", "p5"]
NUM_CLASSES: 1
ROI_DENSEPOSE_HEAD:
NAME: "DensePoseV1ConvXHead"
UV_CONFIDENCE:
ENABLED: True
TYPE: "iid_iso"
POINT_REGRESSION_WEIGHTS: 0.0005
POOLER_TYPE: "ROIAlign"
NUM_COARSE_SEGM_CHANNELS: 2
COARSE_SEGM_TRAINED_BY_MASKS: True
INDEX_WEIGHTS: 1.0
SOLVER:
CLIP_GRADIENTS:
ENABLED: True
WARMUP_FACTOR: 0.025
DATASETS:
TRAIN: ("base_coco_2017_train", "densepose_coco_2014_train")
TEST: ("densepose_chimps",)

View File

@ -0,0 +1,7 @@
_BASE_: "Base-RCNN-FPN-MC.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
MASK_ON: False
DENSEPOSE_ON: False
RESNETS:
DEPTH: 50

View File

@ -0,0 +1,7 @@
_BASE_: "../HRNet/densepose_rcnn_HRFPN_HRNet_w32_s1x.yaml"
DATASETS:
TRAIN: ("densepose_coco_2014_minival_100",)
TEST: ("densepose_coco_2014_minival_100",)
SOLVER:
MAX_ITER: 40
STEPS: (30,)

View File

@ -0,0 +1,11 @@
_BASE_: "../Base-DensePose-RCNN-FPN.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
ROI_DENSEPOSE_HEAD:
NAME: "DensePoseDeepLabHead"
DATASETS:
TRAIN: ("densepose_coco_2014_minival_100",)
TEST: ("densepose_coco_2014_minival_100",)
SOLVER:
MAX_ITER: 40
STEPS: (30,)

View File

@ -0,0 +1,13 @@
_BASE_: "../densepose_rcnn_R_50_FPN_s1x.yaml"
MODEL:
WEIGHTS: "https://dl.fbaipublicfiles.com/densepose/densepose_rcnn_R_50_FPN_s1x/165712039/model_final_162be9.pkl"
DATASETS:
TRAIN: ()
TEST: ("densepose_coco_2014_minival_100",)
TEST:
AUG:
ENABLED: True
MIN_SIZES: (400, 500, 600, 700, 800, 900, 1000, 1100, 1200)
MAX_SIZE: 4000
FLIP: True
EXPECTED_RESULTS: [["bbox_TTA", "AP", 61.74, 0.03], ["densepose_gps_TTA", "AP", 60.22, 0.03], ["densepose_gpsm_TTA", "AP", 63.85, 0.03]]

View File

@ -0,0 +1,19 @@
_BASE_: "../Base-DensePose-RCNN-FPN.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
RESNETS:
DEPTH: 50
ROI_DENSEPOSE_HEAD:
UV_CONFIDENCE:
ENABLED: True
TYPE: "iid_iso"
POINT_REGRESSION_WEIGHTS: 0.0005
DATASETS:
TRAIN: ("densepose_coco_2014_minival_100",)
TEST: ("densepose_coco_2014_minival_100",)
SOLVER:
CLIP_GRADIENTS:
ENABLED: True
MAX_ITER: 40
STEPS: (30,)
WARMUP_FACTOR: 0.025

View File

@ -0,0 +1,19 @@
_BASE_: "../Base-DensePose-RCNN-FPN.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
RESNETS:
DEPTH: 50
ROI_DENSEPOSE_HEAD:
UV_CONFIDENCE:
ENABLED: True
TYPE: "indep_aniso"
POINT_REGRESSION_WEIGHTS: 0.0005
DATASETS:
TRAIN: ("densepose_coco_2014_minival_100",)
TEST: ("densepose_coco_2014_minival_100",)
SOLVER:
CLIP_GRADIENTS:
ENABLED: True
MAX_ITER: 40
STEPS: (30,)
WARMUP_FACTOR: 0.025

View File

@ -0,0 +1,8 @@
_BASE_: "../densepose_rcnn_R_50_FPN_s1x.yaml"
MODEL:
WEIGHTS: "https://dl.fbaipublicfiles.com/densepose/densepose_rcnn_R_50_FPN_s1x/165712039/model_final_162be9.pkl"
DATASETS:
TRAIN: ()
TEST: ("densepose_coco_2014_minival_100",)
TEST:
EXPECTED_RESULTS: [["bbox", "AP", 59.27, 0.025], ["densepose_gps", "AP", 60.11, 0.02], ["densepose_gpsm", "AP", 64.20, 0.02]]

View File

@ -0,0 +1,9 @@
_BASE_: "../Base-DensePose-RCNN-FPN.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
DATASETS:
TRAIN: ("densepose_coco_2014_minival_100",)
TEST: ("densepose_coco_2014_minival_100",)
SOLVER:
MAX_ITER: 40
STEPS: (30,)

View File

@ -0,0 +1,18 @@
_BASE_: "../Base-DensePose-RCNN-FPN.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
ROI_HEADS:
NUM_CLASSES: 1
DATASETS:
TRAIN: ("densepose_coco_2014_minival",)
TEST: ("densepose_coco_2014_minival",)
SOLVER:
CLIP_GRADIENTS:
ENABLED: True
CLIP_TYPE: norm
CLIP_VALUE: 1.0
MAX_ITER: 6000
STEPS: (5500, 5800)
TEST:
EXPECTED_RESULTS: [["bbox", "AP", 76.2477, 1.0], ["densepose_gps", "AP", 79.6090, 1.5], ["densepose_gpsm", "AP", 80.0061, 1.5]]

View File

@ -0,0 +1,171 @@
# -*- coding: utf-8 -*-
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
from detectron2.config import CfgNode as CN
def add_dataset_category_config(cfg: CN):
"""
Add config for additional category-related dataset options
- category whitelisting
- category mapping
"""
_C = cfg
_C.DATASETS.CATEGORY_MAPS = CN(new_allowed=True)
_C.DATASETS.WHITELISTED_CATEGORIES = CN(new_allowed=True)
def add_bootstrap_config(cfg: CN):
"""
"""
_C = cfg
_C.BOOTSTRAP_DATASETS = []
_C.BOOTSTRAP_MODEL = CN()
_C.BOOTSTRAP_MODEL.WEIGHTS = ""
_C.BOOTSTRAP_MODEL.DEVICE = "cuda"
def get_bootstrap_dataset_config() -> CN:
_C = CN()
_C.DATASET = ""
# ratio used to mix data loaders
_C.RATIO = 0.1
# image loader
_C.IMAGE_LOADER = CN(new_allowed=True)
_C.IMAGE_LOADER.TYPE = ""
_C.IMAGE_LOADER.BATCH_SIZE = 4
_C.IMAGE_LOADER.NUM_WORKERS = 4
# inference
_C.INFERENCE = CN()
# batch size for model inputs
_C.INFERENCE.INPUT_BATCH_SIZE = 4
# batch size to group model outputs
_C.INFERENCE.OUTPUT_BATCH_SIZE = 2
# sampled data
_C.DATA_SAMPLER = CN(new_allowed=True)
_C.DATA_SAMPLER.TYPE = ""
# filter
_C.FILTER = CN(new_allowed=True)
_C.FILTER.TYPE = ""
return _C
def load_bootstrap_config(cfg: CN):
"""
Bootstrap datasets are given as a list of `dict` that are not automatically
converted into CfgNode. This method processes all bootstrap dataset entries
and ensures that they are in CfgNode format and comply with the specification
"""
if not cfg.BOOTSTRAP_DATASETS:
return
bootstrap_datasets_cfgnodes = []
for dataset_cfg in cfg.BOOTSTRAP_DATASETS:
_C = get_bootstrap_dataset_config().clone()
_C.merge_from_other_cfg(CN(dataset_cfg))
bootstrap_datasets_cfgnodes.append(_C)
cfg.BOOTSTRAP_DATASETS = bootstrap_datasets_cfgnodes
def add_densepose_head_config(cfg: CN):
"""
Add config for densepose head.
"""
_C = cfg
_C.MODEL.DENSEPOSE_ON = True
_C.MODEL.ROI_DENSEPOSE_HEAD = CN()
_C.MODEL.ROI_DENSEPOSE_HEAD.NAME = ""
_C.MODEL.ROI_DENSEPOSE_HEAD.NUM_STACKED_CONVS = 8
# Number of parts used for point labels
_C.MODEL.ROI_DENSEPOSE_HEAD.NUM_PATCHES = 24
_C.MODEL.ROI_DENSEPOSE_HEAD.DECONV_KERNEL = 4
_C.MODEL.ROI_DENSEPOSE_HEAD.CONV_HEAD_DIM = 512
_C.MODEL.ROI_DENSEPOSE_HEAD.CONV_HEAD_KERNEL = 3
_C.MODEL.ROI_DENSEPOSE_HEAD.UP_SCALE = 2
_C.MODEL.ROI_DENSEPOSE_HEAD.HEATMAP_SIZE = 112
_C.MODEL.ROI_DENSEPOSE_HEAD.POOLER_TYPE = "ROIAlignV2"
_C.MODEL.ROI_DENSEPOSE_HEAD.POOLER_RESOLUTION = 28
_C.MODEL.ROI_DENSEPOSE_HEAD.POOLER_SAMPLING_RATIO = 2
_C.MODEL.ROI_DENSEPOSE_HEAD.NUM_COARSE_SEGM_CHANNELS = 2 # 15 or 2
# Overlap threshold for an RoI to be considered foreground (if >= FG_IOU_THRESHOLD)
_C.MODEL.ROI_DENSEPOSE_HEAD.FG_IOU_THRESHOLD = 0.7
    # Loss weights for annotation masks (14 parts)
_C.MODEL.ROI_DENSEPOSE_HEAD.INDEX_WEIGHTS = 5.0
# Loss weights for surface parts. (24 Parts)
_C.MODEL.ROI_DENSEPOSE_HEAD.PART_WEIGHTS = 1.0
# Loss weights for UV regression.
_C.MODEL.ROI_DENSEPOSE_HEAD.POINT_REGRESSION_WEIGHTS = 0.01
# Coarse segmentation is trained using instance segmentation task data
_C.MODEL.ROI_DENSEPOSE_HEAD.COARSE_SEGM_TRAINED_BY_MASKS = False
# For Decoder
_C.MODEL.ROI_DENSEPOSE_HEAD.DECODER_ON = True
_C.MODEL.ROI_DENSEPOSE_HEAD.DECODER_NUM_CLASSES = 256
_C.MODEL.ROI_DENSEPOSE_HEAD.DECODER_CONV_DIMS = 256
_C.MODEL.ROI_DENSEPOSE_HEAD.DECODER_NORM = ""
_C.MODEL.ROI_DENSEPOSE_HEAD.DECODER_COMMON_STRIDE = 4
# For DeepLab head
_C.MODEL.ROI_DENSEPOSE_HEAD.DEEPLAB = CN()
_C.MODEL.ROI_DENSEPOSE_HEAD.DEEPLAB.NORM = "GN"
_C.MODEL.ROI_DENSEPOSE_HEAD.DEEPLAB.NONLOCAL_ON = 0
# Confidences
# Enable learning UV confidences (variances) along with the actual values
_C.MODEL.ROI_DENSEPOSE_HEAD.UV_CONFIDENCE = CN({"ENABLED": False})
# UV confidence lower bound
_C.MODEL.ROI_DENSEPOSE_HEAD.UV_CONFIDENCE.EPSILON = 0.01
# Enable learning segmentation confidences (variances) along with the actual values
_C.MODEL.ROI_DENSEPOSE_HEAD.SEGM_CONFIDENCE = CN({"ENABLED": False})
# Segmentation confidence lower bound
_C.MODEL.ROI_DENSEPOSE_HEAD.SEGM_CONFIDENCE.EPSILON = 0.01
# Statistical model type for confidence learning, possible values:
# - "iid_iso": statistically independent identically distributed residuals
# with isotropic covariance
# - "indep_aniso": statistically independent residuals with anisotropic
# covariances
_C.MODEL.ROI_DENSEPOSE_HEAD.UV_CONFIDENCE.TYPE = "iid_iso"
# List of angles for rotation in data augmentation during training
_C.INPUT.ROTATION_ANGLES = [0]
_C.TEST.AUG.ROTATION_ANGLES = () # Rotation TTA
def add_hrnet_config(cfg: CN):
"""
Add config for HRNet backbone.
"""
_C = cfg
# For HigherHRNet w32
_C.MODEL.HRNET = CN()
_C.MODEL.HRNET.STEM_INPLANES = 64
_C.MODEL.HRNET.STAGE2 = CN()
_C.MODEL.HRNET.STAGE2.NUM_MODULES = 1
_C.MODEL.HRNET.STAGE2.NUM_BRANCHES = 2
_C.MODEL.HRNET.STAGE2.BLOCK = "BASIC"
_C.MODEL.HRNET.STAGE2.NUM_BLOCKS = [4, 4]
_C.MODEL.HRNET.STAGE2.NUM_CHANNELS = [32, 64]
_C.MODEL.HRNET.STAGE2.FUSE_METHOD = "SUM"
_C.MODEL.HRNET.STAGE3 = CN()
_C.MODEL.HRNET.STAGE3.NUM_MODULES = 4
_C.MODEL.HRNET.STAGE3.NUM_BRANCHES = 3
_C.MODEL.HRNET.STAGE3.BLOCK = "BASIC"
_C.MODEL.HRNET.STAGE3.NUM_BLOCKS = [4, 4, 4]
_C.MODEL.HRNET.STAGE3.NUM_CHANNELS = [32, 64, 128]
_C.MODEL.HRNET.STAGE3.FUSE_METHOD = "SUM"
_C.MODEL.HRNET.STAGE4 = CN()
_C.MODEL.HRNET.STAGE4.NUM_MODULES = 3
_C.MODEL.HRNET.STAGE4.NUM_BRANCHES = 4
_C.MODEL.HRNET.STAGE4.BLOCK = "BASIC"
_C.MODEL.HRNET.STAGE4.NUM_BLOCKS = [4, 4, 4, 4]
_C.MODEL.HRNET.STAGE4.NUM_CHANNELS = [32, 64, 128, 256]
_C.MODEL.HRNET.STAGE4.FUSE_METHOD = "SUM"
_C.MODEL.HRNET.HRFPN = CN()
_C.MODEL.HRNET.HRFPN.OUT_CHANNELS = 256
def add_densepose_config(cfg: CN):
add_densepose_head_config(cfg)
add_hrnet_config(cfg)
add_bootstrap_config(cfg)
add_dataset_category_config(cfg)
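As a usage note, a minimal sketch of how the options defined above are composed and read back; the YAML path is a placeholder for any config that defines `BOOTSTRAP_DATASETS`:
```
from detectron2.config import get_cfg

from densepose.config import add_densepose_config, load_bootstrap_config

cfg = get_cfg()
add_densepose_config(cfg)  # registers ROI_DENSEPOSE_HEAD, HRNET, BOOTSTRAP_* and category options
cfg.merge_from_file("path/to/bootstrap_config.yaml")  # placeholder path
load_bootstrap_config(cfg)  # turns plain-dict BOOTSTRAP_DATASETS entries into CfgNode objects

for dataset_cfg in cfg.BOOTSTRAP_DATASETS:
    print(dataset_cfg.DATASET, dataset_cfg.IMAGE_LOADER.TYPE, dataset_cfg.DATA_SAMPLER.TYPE)
```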

View File

@ -0,0 +1,23 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
from .build import (
build_detection_test_loader,
build_detection_train_loader,
build_combined_loader,
build_frame_selector,
build_inference_based_loaders,
has_inference_based_loaders,
BootstrapDatasetFactoryCatalog,
)
from .combined_loader import CombinedDataLoader
from .dataset_mapper import DatasetMapper
from .inference_based_loader import InferenceBasedLoader, ScoreBasedFilter
from .utils import is_relative_local_path, maybe_prepend_base_path
# ensure the builtin datasets are registered
from . import datasets
# ensure the bootstrap datasets builders are registered
from . import build
__all__ = [k for k in globals().keys() if not k.startswith("_")]
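A small, hedged sketch of one of the helpers re-exported here, `build_frame_selector`; the package path `densepose.data` is assumed, and the values mirror the `IMAGE_LOADER.SELECT` options used in the bootstrap configs:
```
from detectron2.config import CfgNode as CN

from densepose.data import build_frame_selector

# SELECT-style node, same shape as IMAGE_LOADER.SELECT in the bootstrap configs.
select_cfg = CN({"STRATEGY": "random_k", "NUM_IMAGES": 4})
selector = build_frame_selector(select_cfg)
print(type(selector).__name__)  # expected: RandomKFramesSelector
```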

View File

@ -0,0 +1,604 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import itertools
import logging
import numpy as np
from collections import UserDict
from typing import Any, Callable, Collection, Dict, Iterable, List, Optional, Sequence
import torch
from torch.utils.data.dataset import Dataset
from detectron2.config import CfgNode
from detectron2.data.build import (
build_batch_data_loader,
load_proposals_into_dataset,
print_instances_class_histogram,
trivial_batch_collator,
)
from detectron2.data.catalog import DatasetCatalog, Metadata, MetadataCatalog
from detectron2.data.common import DatasetFromList, MapDataset
from detectron2.data.samplers import InferenceSampler, RepeatFactorTrainingSampler, TrainingSampler
from detectron2.utils.comm import get_world_size
from densepose.config import get_bootstrap_dataset_config
from .combined_loader import CombinedDataLoader, Loader
from .dataset_mapper import DatasetMapper
from .datasets.coco import DENSEPOSE_KEYS_WITHOUT_MASK as DENSEPOSE_COCO_KEYS_WITHOUT_MASK
from .datasets.coco import DENSEPOSE_MASK_KEY as DENSEPOSE_COCO_MASK_KEY
from .datasets.dataset_type import DatasetType
from .inference_based_loader import InferenceBasedLoader, ScoreBasedFilter
from .samplers import (
DensePoseConfidenceBasedSampler,
DensePoseUniformSampler,
MaskFromDensePoseSampler,
PredictionToGroundTruthSampler,
)
from .transform import ImageResizeTransform
from .video import (
FirstKFramesSelector,
FrameSelectionStrategy,
LastKFramesSelector,
RandomKFramesSelector,
VideoKeyframeDataset,
video_list_from_file,
)
__all__ = ["build_detection_train_loader", "build_detection_test_loader"]
Instance = Dict[str, Any]
InstancePredicate = Callable[[Instance], bool]
def _compute_num_images_per_worker(cfg: CfgNode):
num_workers = get_world_size()
images_per_batch = cfg.SOLVER.IMS_PER_BATCH
assert (
images_per_batch % num_workers == 0
), "SOLVER.IMS_PER_BATCH ({}) must be divisible by the number of workers ({}).".format(
images_per_batch, num_workers
)
assert (
images_per_batch >= num_workers
), "SOLVER.IMS_PER_BATCH ({}) must be larger than the number of workers ({}).".format(
images_per_batch, num_workers
)
images_per_worker = images_per_batch // num_workers
return images_per_worker
def _map_category_id_to_contiguous_id(dataset_name: str, dataset_dicts: Iterable[Instance]):
meta = MetadataCatalog.get(dataset_name)
for dataset_dict in dataset_dicts:
for ann in dataset_dict["annotations"]:
ann["category_id"] = meta.thing_dataset_id_to_contiguous_id[ann["category_id"]]
def _add_category_id_to_contiguous_id_maps_to_metadata(dataset_names: Iterable[str]):
# merge categories for all datasets
merged_categories = {}
for dataset_name in dataset_names:
meta = MetadataCatalog.get(dataset_name)
for cat_id, cat_name in meta.categories.items():
if cat_id not in merged_categories:
merged_categories[cat_id] = (cat_name, dataset_name)
continue
cat_name_other, dataset_name_other = merged_categories[cat_id]
if cat_name_other != cat_name:
raise ValueError(
f"Incompatible categories for category ID {cat_id}: "
f'dataset {dataset_name} value "{cat_name}", '
f'dataset {dataset_name_other} value "{cat_name_other}"'
)
merged_cat_id_to_cont_id = {}
for i, cat_id in enumerate(sorted(merged_categories.keys())):
merged_cat_id_to_cont_id[cat_id] = i
# add category maps to metadata
for dataset_name in dataset_names:
meta = MetadataCatalog.get(dataset_name)
categories = meta.get("categories")
meta.thing_classes = [categories[cat_id] for cat_id in sorted(categories.keys())]
meta.thing_dataset_id_to_contiguous_id = {
cat_id: merged_cat_id_to_cont_id[cat_id] for cat_id in sorted(categories.keys())
}
meta.thing_contiguous_id_to_dataset_id = {
merged_cat_id_to_cont_id[cat_id]: cat_id for cat_id in sorted(categories.keys())
}
def _maybe_create_general_keep_instance_predicate(cfg: CfgNode) -> Optional[InstancePredicate]:
def has_annotations(instance: Instance) -> bool:
return "annotations" in instance
    def has_only_crowd_annotations(instance: Instance) -> bool:
for ann in instance["annotations"]:
if ann.get("is_crowd", 0) == 0:
return False
return True
def general_keep_instance_predicate(instance: Instance) -> bool:
        return has_annotations(instance) and not has_only_crowd_annotations(instance)
if not cfg.DATALOADER.FILTER_EMPTY_ANNOTATIONS:
return None
return general_keep_instance_predicate
def _maybe_create_keypoints_keep_instance_predicate(cfg: CfgNode) -> Optional[InstancePredicate]:
min_num_keypoints = cfg.MODEL.ROI_KEYPOINT_HEAD.MIN_KEYPOINTS_PER_IMAGE
def has_sufficient_num_keypoints(instance: Instance) -> bool:
num_kpts = sum(
(np.array(ann["keypoints"][2::3]) > 0).sum()
for ann in instance["annotations"]
if "keypoints" in ann
)
return num_kpts >= min_num_keypoints
if cfg.MODEL.KEYPOINT_ON and (min_num_keypoints > 0):
return has_sufficient_num_keypoints
return None
def _maybe_create_mask_keep_instance_predicate(cfg: CfgNode) -> Optional[InstancePredicate]:
if not cfg.MODEL.MASK_ON:
return None
def has_mask_annotations(instance: Instance) -> bool:
return any("segmentation" in ann for ann in instance["annotations"])
return has_mask_annotations
def _maybe_create_densepose_keep_instance_predicate(cfg: CfgNode) -> Optional[InstancePredicate]:
if not cfg.MODEL.DENSEPOSE_ON:
return None
use_masks = cfg.MODEL.ROI_DENSEPOSE_HEAD.COARSE_SEGM_TRAINED_BY_MASKS
def has_densepose_annotations(instance: Instance) -> bool:
for ann in instance["annotations"]:
if all(key in ann for key in DENSEPOSE_COCO_KEYS_WITHOUT_MASK) and (
(DENSEPOSE_COCO_MASK_KEY in ann) or ("segmentation" in ann)
):
return True
if use_masks and "segmentation" in ann:
return True
return False
return has_densepose_annotations
def _maybe_create_specific_keep_instance_predicate(cfg: CfgNode) -> Optional[InstancePredicate]:
specific_predicate_creators = [
_maybe_create_keypoints_keep_instance_predicate,
_maybe_create_mask_keep_instance_predicate,
_maybe_create_densepose_keep_instance_predicate,
]
predicates = [creator(cfg) for creator in specific_predicate_creators]
predicates = [p for p in predicates if p is not None]
if not predicates:
return None
def combined_predicate(instance: Instance) -> bool:
return any(p(instance) for p in predicates)
return combined_predicate
def _get_train_keep_instance_predicate(cfg: CfgNode):
general_keep_predicate = _maybe_create_general_keep_instance_predicate(cfg)
combined_specific_keep_predicate = _maybe_create_specific_keep_instance_predicate(cfg)
def combined_general_specific_keep_predicate(instance: Instance) -> bool:
return general_keep_predicate(instance) and combined_specific_keep_predicate(instance)
if (general_keep_predicate is None) and (combined_specific_keep_predicate is None):
return None
if general_keep_predicate is None:
return combined_specific_keep_predicate
if combined_specific_keep_predicate is None:
return general_keep_predicate
return combined_general_specific_keep_predicate
def _get_test_keep_instance_predicate(cfg: CfgNode):
general_keep_predicate = _maybe_create_general_keep_instance_predicate(cfg)
return general_keep_predicate
def _maybe_filter_and_map_categories(
dataset_name: str, dataset_dicts: List[Instance]
) -> List[Instance]:
meta = MetadataCatalog.get(dataset_name)
whitelisted_categories = meta.get("whitelisted_categories")
category_map = meta.get("category_map", {})
if whitelisted_categories is None and not category_map:
return dataset_dicts
filtered_dataset_dicts = []
for dataset_dict in dataset_dicts:
anns = []
for ann in dataset_dict["annotations"]:
cat_id = ann["category_id"]
if whitelisted_categories is not None and cat_id not in whitelisted_categories:
continue
ann["category_id"] = category_map.get(cat_id, cat_id)
anns.append(ann)
dataset_dict["annotations"] = anns
filtered_dataset_dicts.append(dataset_dict)
return filtered_dataset_dicts
def _add_category_whitelists_to_metadata(cfg: CfgNode):
for dataset_name, whitelisted_cat_ids in cfg.DATASETS.WHITELISTED_CATEGORIES.items():
meta = MetadataCatalog.get(dataset_name)
meta.whitelisted_categories = whitelisted_cat_ids
logger = logging.getLogger(__name__)
logger.info(
"Whitelisted categories for dataset {}: {}".format(
dataset_name, meta.whitelisted_categories
)
)
def _add_category_maps_to_metadata(cfg: CfgNode):
for dataset_name, category_map in cfg.DATASETS.CATEGORY_MAPS.items():
category_map = {
int(cat_id_src): int(cat_id_dst) for cat_id_src, cat_id_dst in category_map.items()
}
meta = MetadataCatalog.get(dataset_name)
meta.category_map = category_map
logger = logging.getLogger(__name__)
logger.info("Category maps for dataset {}: {}".format(dataset_name, meta.category_map))
def combine_detection_dataset_dicts(
dataset_names: Collection[str],
keep_instance_predicate: Optional[InstancePredicate] = None,
proposal_files: Optional[Collection[str]] = None,
) -> List[Instance]:
"""
Load and prepare dataset dicts for training / testing
Args:
dataset_names (Collection[str]): a list of dataset names
keep_instance_predicate (Callable: Dict[str, Any] -> bool): predicate
applied to instance dicts which defines whether to keep the instance
proposal_files (Collection[str]): if given, a list of object proposal files
that match each dataset in `dataset_names`.
"""
assert len(dataset_names)
if proposal_files is None:
proposal_files = [None] * len(dataset_names)
assert len(dataset_names) == len(proposal_files)
# load annotations and dataset metadata
dataset_map = {}
for dataset_name in dataset_names:
dataset_dicts = DatasetCatalog.get(dataset_name)
dataset_map[dataset_name] = dataset_dicts
# initialize category maps
_add_category_id_to_contiguous_id_maps_to_metadata(dataset_names)
# apply category maps
all_datasets_dicts = []
for dataset_name, proposal_file in zip(dataset_names, proposal_files):
dataset_dicts = dataset_map[dataset_name]
assert len(dataset_dicts), f"Dataset '{dataset_name}' is empty!"
if proposal_file is not None:
dataset_dicts = load_proposals_into_dataset(dataset_dicts, proposal_file)
dataset_dicts = _maybe_filter_and_map_categories(dataset_name, dataset_dicts)
_map_category_id_to_contiguous_id(dataset_name, dataset_dicts)
print_instances_class_histogram(
dataset_dicts, MetadataCatalog.get(dataset_name).thing_classes
)
all_datasets_dicts.append(dataset_dicts)
if keep_instance_predicate is not None:
all_datasets_dicts_plain = [
d
for d in itertools.chain.from_iterable(all_datasets_dicts)
if keep_instance_predicate(d)
]
else:
all_datasets_dicts_plain = list(itertools.chain.from_iterable(all_datasets_dicts))
return all_datasets_dicts_plain
def build_detection_train_loader(cfg: CfgNode, mapper=None):
"""
A data loader is created in a way similar to that of Detectron2.
The main differences are:
    - it allows combining datasets with different but compatible object category sets
The data loader is created by the following steps:
1. Use the dataset names in config to query :class:`DatasetCatalog`, and obtain a list of dicts.
2. Start workers to work on the dicts. Each worker will:
* Map each metadata dict into another format to be consumed by the model.
* Batch them by simply putting dicts into a list.
The batched ``list[mapped_dict]`` is what this dataloader will return.
Args:
cfg (CfgNode): the config
mapper (callable): a callable which takes a sample (dict) from dataset and
returns the format to be consumed by the model.
By default it will be `DatasetMapper(cfg, True)`.
Returns:
an infinite iterator of training data
"""
_add_category_whitelists_to_metadata(cfg)
_add_category_maps_to_metadata(cfg)
dataset_dicts = combine_detection_dataset_dicts(
cfg.DATASETS.TRAIN,
keep_instance_predicate=_get_train_keep_instance_predicate(cfg),
proposal_files=cfg.DATASETS.PROPOSAL_FILES_TRAIN if cfg.MODEL.LOAD_PROPOSALS else None,
)
dataset = DatasetFromList(dataset_dicts, copy=False)
if mapper is None:
mapper = DatasetMapper(cfg, True)
dataset = MapDataset(dataset, mapper)
sampler_name = cfg.DATALOADER.SAMPLER_TRAIN
logger = logging.getLogger(__name__)
logger.info("Using training sampler {}".format(sampler_name))
if sampler_name == "TrainingSampler":
sampler = TrainingSampler(len(dataset))
elif sampler_name == "RepeatFactorTrainingSampler":
repeat_factors = RepeatFactorTrainingSampler.repeat_factors_from_category_frequency(
dataset_dicts, cfg.DATALOADER.REPEAT_THRESHOLD
)
sampler = RepeatFactorTrainingSampler(repeat_factors)
else:
raise ValueError("Unknown training sampler: {}".format(sampler_name))
return build_batch_data_loader(
dataset,
sampler,
cfg.SOLVER.IMS_PER_BATCH,
aspect_ratio_grouping=cfg.DATALOADER.ASPECT_RATIO_GROUPING,
num_workers=cfg.DATALOADER.NUM_WORKERS,
)
def build_detection_test_loader(cfg, dataset_name, mapper=None):
"""
Similar to `build_detection_train_loader`, but this function uses the given
`dataset_name` argument (instead of the names in cfg) and uses batch size 1.
Args:
cfg: a detectron2 CfgNode
dataset_name (str): a name of the dataset that's available in the DatasetCatalog
mapper (callable): a callable which takes a sample (dict) from dataset
and returns the format to be consumed by the model.
By default it will be `DatasetMapper(cfg, False)`.
Returns:
DataLoader: a torch DataLoader, that loads the given detection
dataset, with test-time transformation and batching.
"""
_add_category_whitelists_to_metadata(cfg)
_add_category_maps_to_metadata(cfg)
dataset_dicts = combine_detection_dataset_dicts(
[dataset_name],
keep_instance_predicate=_get_test_keep_instance_predicate(cfg),
proposal_files=[
cfg.DATASETS.PROPOSAL_FILES_TEST[list(cfg.DATASETS.TEST).index(dataset_name)]
]
if cfg.MODEL.LOAD_PROPOSALS
else None,
)
dataset = DatasetFromList(dataset_dicts)
if mapper is None:
mapper = DatasetMapper(cfg, False)
dataset = MapDataset(dataset, mapper)
sampler = InferenceSampler(len(dataset))
# Always use 1 image per worker during inference since this is the
# standard when reporting inference time in papers.
batch_sampler = torch.utils.data.sampler.BatchSampler(sampler, 1, drop_last=False)
data_loader = torch.utils.data.DataLoader(
dataset,
num_workers=cfg.DATALOADER.NUM_WORKERS,
batch_sampler=batch_sampler,
collate_fn=trivial_batch_collator,
)
return data_loader
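# Illustrative usage sketch (hypothetical helper): evaluating on one registered
# dataset; the dataset name is an assumption taken from the registrations below.
def _example_test_loader_usage(cfg: CfgNode):
    loader = build_detection_test_loader(cfg, "densepose_coco_2014_minival")
    # Batches of size 1, with test-time transforms applied and no annotations.
    return next(iter(loader))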
def build_frame_selector(cfg: CfgNode):
strategy = FrameSelectionStrategy(cfg.STRATEGY)
if strategy == FrameSelectionStrategy.RANDOM_K:
frame_selector = RandomKFramesSelector(cfg.NUM_IMAGES)
elif strategy == FrameSelectionStrategy.FIRST_K:
frame_selector = FirstKFramesSelector(cfg.NUM_IMAGES)
elif strategy == FrameSelectionStrategy.LAST_K:
frame_selector = LastKFramesSelector(cfg.NUM_IMAGES)
elif strategy == FrameSelectionStrategy.ALL:
frame_selector = None
return frame_selector
def build_transform(cfg: CfgNode, data_type: str):
if cfg.TYPE == "resize":
if data_type == "image":
return ImageResizeTransform(cfg.MIN_SIZE, cfg.MAX_SIZE)
raise ValueError(f"Unknown transform {cfg.TYPE} for data type {data_type}")
def build_combined_loader(cfg: CfgNode, loaders: Collection[Loader], ratios: Sequence[float]):
images_per_worker = _compute_num_images_per_worker(cfg)
return CombinedDataLoader(loaders, images_per_worker, ratios)
def build_bootstrap_dataset(dataset_name: str, cfg: CfgNode) -> Sequence[torch.Tensor]:
"""
Build dataset that provides data to bootstrap on
Args:
dataset_name (str): Name of the dataset, needs to have associated metadata
to load the data
cfg (CfgNode): bootstrapping config
Returns:
Sequence[Tensor] - dataset that provides image batches, Tensors of size
[N, C, H, W] of type float32
"""
logger = logging.getLogger(__name__)
meta = MetadataCatalog.get(dataset_name)
factory = BootstrapDatasetFactoryCatalog.get(meta.dataset_type)
dataset = None
if factory is not None:
dataset = factory(meta, cfg)
if dataset is None:
logger.warning(f"Failed to create dataset {dataset_name} of type {meta.dataset_type}")
return dataset
def build_data_sampler(cfg: CfgNode):
if cfg.TYPE == "densepose_uniform":
data_sampler = PredictionToGroundTruthSampler()
# transform densepose pred -> gt
data_sampler.register_sampler(
"pred_densepose",
"gt_densepose",
DensePoseUniformSampler(count_per_class=cfg.COUNT_PER_CLASS),
)
data_sampler.register_sampler("pred_densepose", "gt_masks", MaskFromDensePoseSampler())
return data_sampler
elif cfg.TYPE == "densepose_UV_confidence":
data_sampler = PredictionToGroundTruthSampler()
# transform densepose pred -> gt
data_sampler.register_sampler(
"pred_densepose",
"gt_densepose",
DensePoseConfidenceBasedSampler(
confidence_channel="sigma_2",
count_per_class=cfg.COUNT_PER_CLASS,
search_proportion=0.5,
),
)
data_sampler.register_sampler("pred_densepose", "gt_masks", MaskFromDensePoseSampler())
return data_sampler
elif cfg.TYPE == "densepose_fine_segm_confidence":
data_sampler = PredictionToGroundTruthSampler()
# transform densepose pred -> gt
data_sampler.register_sampler(
"pred_densepose",
"gt_densepose",
DensePoseConfidenceBasedSampler(
confidence_channel="fine_segm_confidence",
count_per_class=cfg.COUNT_PER_CLASS,
search_proportion=0.5,
),
)
data_sampler.register_sampler("pred_densepose", "gt_masks", MaskFromDensePoseSampler())
return data_sampler
elif cfg.TYPE == "densepose_coarse_segm_confidence":
data_sampler = PredictionToGroundTruthSampler()
# transform densepose pred -> gt
data_sampler.register_sampler(
"pred_densepose",
"gt_densepose",
DensePoseConfidenceBasedSampler(
confidence_channel="coarse_segm_confidence",
count_per_class=cfg.COUNT_PER_CLASS,
search_proportion=0.5,
),
)
data_sampler.register_sampler("pred_densepose", "gt_masks", MaskFromDensePoseSampler())
return data_sampler
raise ValueError(f"Unknown data sampler type {cfg.TYPE}")
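# Illustrative sketch (hypothetical helper): a minimal DATA_SAMPLER config that
# the dispatcher above accepts; the values are arbitrary assumptions.
def _example_data_sampler_from_cfg():
    sampler_cfg = CfgNode({"TYPE": "densepose_uniform", "COUNT_PER_CLASS": 8})
    return build_data_sampler(sampler_cfg)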
def build_data_filter(cfg: CfgNode):
if cfg.TYPE == "detection_score":
min_score = cfg.MIN_VALUE
return ScoreBasedFilter(min_score=min_score)
raise ValueError(f"Unknown data filter type {cfg.TYPE}")
def build_inference_based_loader(
cfg: CfgNode, dataset_cfg: CfgNode, model: torch.nn.Module
) -> InferenceBasedLoader:
"""
Constructs data loader based on inference results of a model.
"""
dataset = build_bootstrap_dataset(dataset_cfg.DATASET, dataset_cfg.IMAGE_LOADER)
training_sampler = TrainingSampler(len(dataset))
data_loader = torch.utils.data.DataLoader(
dataset,
batch_size=dataset_cfg.IMAGE_LOADER.BATCH_SIZE,
sampler=training_sampler,
num_workers=dataset_cfg.IMAGE_LOADER.NUM_WORKERS,
collate_fn=trivial_batch_collator,
)
return InferenceBasedLoader(
model,
data_loader=data_loader,
data_sampler=build_data_sampler(dataset_cfg.DATA_SAMPLER),
data_filter=build_data_filter(dataset_cfg.FILTER),
shuffle=True,
batch_size=dataset_cfg.INFERENCE.OUTPUT_BATCH_SIZE,
inference_batch_size=dataset_cfg.INFERENCE.INPUT_BATCH_SIZE,
)
def has_inference_based_loaders(cfg: CfgNode) -> bool:
"""
Returns True if at least one inference-based loader must
be instantiated for training
"""
return len(cfg.BOOTSTRAP_DATASETS) > 0
def build_inference_based_loaders(
cfg: CfgNode, model: torch.nn.Module
) -> List[InferenceBasedLoader]:
loaders = []
ratios = []
for dataset_spec in cfg.BOOTSTRAP_DATASETS:
dataset_cfg = get_bootstrap_dataset_config().clone()
dataset_cfg.merge_from_other_cfg(CfgNode(dataset_spec))
loader = build_inference_based_loader(cfg, dataset_cfg, model)
loaders.append(loader)
ratios.append(dataset_cfg.RATIO)
return loaders, ratios
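# Illustrative sketch (hypothetical helper): how the bootstrap pieces are wired
# together, assuming `cfg` has BOOTSTRAP_DATASETS configured and `model` is a
# detection model in eval mode.
def _example_bootstrap_combined_loader(cfg: CfgNode, model: torch.nn.Module):
    loaders, ratios = build_inference_based_loaders(cfg, model)
    return build_combined_loader(cfg, loaders, ratios)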
def build_video_list_dataset(meta: Metadata, cfg: CfgNode):
video_list_fpath = meta.video_list_fpath
video_base_path = meta.video_base_path
if cfg.TYPE == "video_keyframe":
frame_selector = build_frame_selector(cfg.SELECT)
transform = build_transform(cfg.TRANSFORM, data_type="image")
video_list = video_list_from_file(video_list_fpath, video_base_path)
return VideoKeyframeDataset(video_list, frame_selector, transform)
class _BootstrapDatasetFactoryCatalog(UserDict):
"""
A global dictionary that stores dataset creation functions used to build
bootstrap datasets from metadata and config, keyed by DatasetType
"""
def register(self, dataset_type: DatasetType, factory: Callable[[Metadata, CfgNode], Dataset]):
"""
Args:
dataset_type (DatasetType): a DatasetType e.g. DatasetType.VIDEO_LIST
factory (Callable[Metadata, CfgNode]): a callable which takes Metadata and cfg
arguments and returns a dataset object.
"""
assert dataset_type not in self, "Dataset '{}' is already registered!".format(dataset_type)
self[dataset_type] = factory
BootstrapDatasetFactoryCatalog = _BootstrapDatasetFactoryCatalog()
BootstrapDatasetFactoryCatalog.register(DatasetType.VIDEO_LIST, build_video_list_dataset)

View File

@ -0,0 +1,44 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import random
from collections import deque
from typing import Any, Collection, Deque, Iterable, Iterator, List, Sequence
Loader = Iterable[Any]
def _pooled_next(iterator: Iterator[Any], pool: Deque[Any]):
if not pool:
pool.extend(next(iterator))
return pool.popleft()
class CombinedDataLoader:
"""
Combines data loaders using the provided sampling ratios
"""
BATCH_COUNT = 100
def __init__(self, loaders: Collection[Loader], batch_size: int, ratios: Sequence[float]):
self.loaders = loaders
self.batch_size = batch_size
self.ratios = ratios
def __iter__(self) -> Iterator[List[Any]]:
iters = [iter(loader) for loader in self.loaders]
indices = []
pool = [deque() for _ in iters]  # a separate buffer per loader (avoid sharing one deque)
# infinite iterator, as in D2
while True:
if not indices:
# just a buffer of indices, its size doesn't matter
# as long as it's a multiple of batch_size
k = self.batch_size * self.BATCH_COUNT
indices = random.choices(range(len(self.loaders)), self.ratios, k=k)
try:
batch = [_pooled_next(iters[i], pool[i]) for i in indices[: self.batch_size]]
except StopIteration:
break
indices = indices[self.batch_size :]
yield batch
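# Minimal self-contained sketch of the mixing behaviour (toy infinite "loaders";
# the 3:1 ratio and batch size are arbitrary assumptions).
def _example_combined_loader():
    import itertools

    loader_a = itertools.repeat(["a1", "a2"])  # each yielded item is a batch (a list)
    loader_b = itertools.repeat(["b1", "b2"])
    combined = CombinedDataLoader([loader_a, loader_b], batch_size=4, ratios=[0.75, 0.25])
    # A combined batch is a list of 4 samples drawn roughly 3:1 from loader_a vs loader_b.
    return next(iter(combined))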

View File

@ -0,0 +1,168 @@
# -*- coding: utf-8 -*-
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import copy
import logging
from typing import Any, Dict, Tuple
import torch
from fvcore.common.file_io import PathManager
from detectron2.data import MetadataCatalog
from detectron2.data import detection_utils as utils
from detectron2.data import transforms as T
from detectron2.layers import ROIAlign
from detectron2.structures import BoxMode
from .structures import DensePoseDataRelative, DensePoseList, DensePoseTransformData
def build_augmentation(cfg, is_train):
logger = logging.getLogger(__name__)
result = utils.build_augmentation(cfg, is_train)
if is_train:
random_rotation = T.RandomRotation(
cfg.INPUT.ROTATION_ANGLES, expand=False, sample_style="choice"
)
result.append(random_rotation)
logger.info("DensePose-specific augmentation used in training: " + str(random_rotation))
return result
class DatasetMapper:
"""
A customized version of `detectron2.data.DatasetMapper`
"""
def __init__(self, cfg, is_train=True):
self.augmentation = build_augmentation(cfg, is_train)
# fmt: off
self.img_format = cfg.INPUT.FORMAT
self.mask_on = (
cfg.MODEL.MASK_ON or (
cfg.MODEL.DENSEPOSE_ON
and cfg.MODEL.ROI_DENSEPOSE_HEAD.COARSE_SEGM_TRAINED_BY_MASKS)
)
self.keypoint_on = cfg.MODEL.KEYPOINT_ON
self.densepose_on = cfg.MODEL.DENSEPOSE_ON
assert not cfg.MODEL.LOAD_PROPOSALS, "not supported yet"
# fmt: on
if self.keypoint_on and is_train:
# Flip only makes sense in training
self.keypoint_hflip_indices = utils.create_keypoint_hflip_indices(cfg.DATASETS.TRAIN)
else:
self.keypoint_hflip_indices = None
if self.densepose_on:
densepose_transform_srcs = [
MetadataCatalog.get(ds).densepose_transform_src
for ds in cfg.DATASETS.TRAIN + cfg.DATASETS.TEST
]
assert len(densepose_transform_srcs) > 0
# TODO: check that DensePose transformation data is the same for
# all the datasets. Otherwise one would have to pass DB ID with
# each entry to select proper transformation data. For now, since
# all DensePose annotated data uses the same data semantics, we
# omit this check.
densepose_transform_data_fpath = PathManager.get_local_path(densepose_transform_srcs[0])
self.densepose_transform_data = DensePoseTransformData.load(
densepose_transform_data_fpath
)
self.is_train = is_train
def __call__(self, dataset_dict):
"""
Args:
dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.
Returns:
dict: a format that builtin models in detectron2 accept
"""
dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below
image = utils.read_image(dataset_dict["file_name"], format=self.img_format)
utils.check_image_size(dataset_dict, image)
image, transforms = T.apply_transform_gens(self.augmentation, image)
image_shape = image.shape[:2] # h, w
dataset_dict["image"] = torch.as_tensor(image.transpose(2, 0, 1).astype("float32"))
if not self.is_train:
dataset_dict.pop("annotations", None)
return dataset_dict
for anno in dataset_dict["annotations"]:
if not self.mask_on:
anno.pop("segmentation", None)
if not self.keypoint_on:
anno.pop("keypoints", None)
# USER: Implement additional transformations if you have other types of data
# USER: Don't call transpose_densepose if you don't need
annos = [
self._transform_densepose(
utils.transform_instance_annotations(
obj, transforms, image_shape, keypoint_hflip_indices=self.keypoint_hflip_indices
),
transforms,
)
for obj in dataset_dict.pop("annotations")
if obj.get("iscrowd", 0) == 0
]
if self.mask_on:
self._add_densepose_masks_as_segmentation(annos, image_shape)
instances = utils.annotations_to_instances(annos, image_shape, mask_format="bitmask")
densepose_annotations = [obj.get("densepose") for obj in annos]
if densepose_annotations and not all(v is None for v in densepose_annotations):
instances.gt_densepose = DensePoseList(
densepose_annotations, instances.gt_boxes, image_shape
)
dataset_dict["instances"] = instances[instances.gt_boxes.nonempty()]
return dataset_dict
def _transform_densepose(self, annotation, transforms):
if not self.densepose_on:
return annotation
# Handle densepose annotations
is_valid, reason_not_valid = DensePoseDataRelative.validate_annotation(annotation)
if is_valid:
densepose_data = DensePoseDataRelative(annotation, cleanup=True)
densepose_data.apply_transform(transforms, self.densepose_transform_data)
annotation["densepose"] = densepose_data
else:
# logger = logging.getLogger(__name__)
# logger.debug("Could not load DensePose annotation: {}".format(reason_not_valid))
DensePoseDataRelative.cleanup_annotation(annotation)
# NOTE: annotations for certain instances may be unavailable.
# 'None' is accepted by the DensePoseList data structure.
annotation["densepose"] = None
return annotation
def _add_densepose_masks_as_segmentation(
self, annotations: Dict[str, Any], image_shape_hw: Tuple[int, int]
):
for obj in annotations:
if ("densepose" not in obj) or ("segmentation" in obj):
continue
# DP segmentation: torch.Tensor [S, S] of float32, S=256
segm_dp = torch.zeros_like(obj["densepose"].segm)
segm_dp[obj["densepose"].segm > 0] = 1
segm_h, segm_w = segm_dp.shape
bbox_segm_dp = torch.tensor((0, 0, segm_h - 1, segm_w - 1), dtype=torch.float32)
# image bbox
x0, y0, x1, y1 = (
v.item() for v in BoxMode.convert(obj["bbox"], obj["bbox_mode"], BoxMode.XYXY_ABS)
)
segm_aligned = (
ROIAlign((y1 - y0, x1 - x0), 1.0, 0, aligned=True)
.forward(segm_dp.view(1, 1, *segm_dp.shape), bbox_segm_dp)
.squeeze()
)
image_mask = torch.zeros(*image_shape_hw, dtype=torch.float32)
image_mask[y0:y1, x0:x1] = segm_aligned
# segmentation for BitMask: np.array [H, W] of np.bool
obj["segmentation"] = image_mask >= 0.5
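# Illustrative sketch (hypothetical helper): mapping one raw dataset dict into
# the format consumed by the model, assuming `cfg` is a composed DensePose
# config and `dataset_dicts` came from combine_detection_dataset_dicts above.
def _example_map_one_record(cfg, dataset_dicts):
    mapper = DatasetMapper(cfg, is_train=True)
    mapped = mapper(dataset_dicts[0])
    # "image" is a CHW float32 tensor; "instances" carries boxes, masks,
    # keypoints and DensePose annotations where available.
    return mapped["image"].shape, mapped["instances"]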

View File

@ -0,0 +1,5 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
from . import builtin # ensure the builtin datasets are registered
__all__ = [k for k in globals().keys() if "builtin" not in k and not k.startswith("_")]

View File

@ -0,0 +1,13 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
from .chimpnsee import register_dataset as register_chimpnsee_dataset
from .coco import BASE_DATASETS as BASE_COCO_DATASETS
from .coco import DATASETS as COCO_DATASETS
from .coco import register_datasets as register_coco_datasets
DEFAULT_DATASETS_ROOT = "datasets"
register_coco_datasets(COCO_DATASETS, DEFAULT_DATASETS_ROOT)
register_coco_datasets(BASE_COCO_DATASETS, DEFAULT_DATASETS_ROOT)
register_chimpnsee_dataset(DEFAULT_DATASETS_ROOT)

View File

@ -0,0 +1,28 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import os
from typing import Optional
from detectron2.data import DatasetCatalog, MetadataCatalog
from ..utils import maybe_prepend_base_path
from .dataset_type import DatasetType
CHIMPNSEE_DATASET_NAME = "chimpnsee"
def register_dataset(datasets_root: Optional[os.PathLike] = None):
def empty_load_callback():
pass
video_list_fpath = maybe_prepend_base_path(
datasets_root, "chimpnsee/cdna.eva.mpg.de/video_list.txt"
)
video_base_path = maybe_prepend_base_path(datasets_root, "chimpnsee/cdna.eva.mpg.de")
DatasetCatalog.register(CHIMPNSEE_DATASET_NAME, empty_load_callback)
MetadataCatalog.get(CHIMPNSEE_DATASET_NAME).set(
dataset_type=DatasetType.VIDEO_LIST,
video_list_fpath=video_list_fpath,
video_base_path=video_base_path,
)

View File

@ -0,0 +1,324 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import contextlib
import io
import logging
import os
from collections import defaultdict
from dataclasses import dataclass
from typing import Any, Dict, Iterable, List, Optional
from fvcore.common.file_io import PathManager
from fvcore.common.timer import Timer
from detectron2.data import DatasetCatalog, MetadataCatalog
from detectron2.structures import BoxMode
from ..utils import maybe_prepend_base_path
DENSEPOSE_MASK_KEY = "dp_masks"
DENSEPOSE_KEYS_WITHOUT_MASK = ["dp_x", "dp_y", "dp_I", "dp_U", "dp_V"]
DENSEPOSE_KEYS = DENSEPOSE_KEYS_WITHOUT_MASK + [DENSEPOSE_MASK_KEY]
DENSEPOSE_METADATA_URL_PREFIX = "https://dl.fbaipublicfiles.com/densepose/data/"
@dataclass
class CocoDatasetInfo:
name: str
images_root: str
annotations_fpath: str
DATASETS = [
CocoDatasetInfo(
name="densepose_coco_2014_train",
images_root="coco/train2014",
annotations_fpath="coco/annotations/densepose_train2014.json",
),
CocoDatasetInfo(
name="densepose_coco_2014_minival",
images_root="coco/val2014",
annotations_fpath="coco/annotations/densepose_minival2014.json",
),
CocoDatasetInfo(
name="densepose_coco_2014_minival_100",
images_root="coco/val2014",
annotations_fpath="coco/annotations/densepose_minival2014_100.json",
),
CocoDatasetInfo(
name="densepose_coco_2014_valminusminival",
images_root="coco/val2014",
annotations_fpath="coco/annotations/densepose_valminusminival2014.json",
),
CocoDatasetInfo(
name="densepose_chimps",
images_root="densepose_evolution/densepose_chimps",
annotations_fpath="densepose_evolution/annotations/densepose_chimps_densepose.json",
),
CocoDatasetInfo(
name="posetrack2017_train",
images_root="posetrack2017/posetrack_data_2017",
annotations_fpath="posetrack2017/densepose_posetrack_train2017.json",
),
CocoDatasetInfo(
name="posetrack2017_val",
images_root="posetrack2017/posetrack_data_2017",
annotations_fpath="posetrack2017/densepose_posetrack_val2017.json",
),
]
BASE_DATASETS = [
CocoDatasetInfo(
name="base_coco_2017_train",
images_root="coco/train2017",
annotations_fpath="coco/annotations/instances_train2017.json",
),
CocoDatasetInfo(
name="base_coco_2017_val",
images_root="coco/val2017",
annotations_fpath="coco/annotations/instances_val2017.json",
),
CocoDatasetInfo(
name="base_coco_2017_val_100",
images_root="coco/val2017",
annotations_fpath="coco/annotations/instances_val2017_100.json",
),
]
def get_metadata(base_path: Optional[os.PathLike]) -> Dict[str, Any]:
"""
Returns metadata associated with COCO DensePose datasets
Args:
base_path: Optional[os.PathLike]
Base path used to load metadata from
Returns:
Dict[str, Any]
Metadata in the form of a dictionary
"""
meta = {
"densepose_transform_src": maybe_prepend_base_path(base_path, "UV_symmetry_transforms.mat"),
"densepose_smpl_subdiv": maybe_prepend_base_path(base_path, "SMPL_subdiv.mat"),
"densepose_smpl_subdiv_transform": maybe_prepend_base_path(
base_path, "SMPL_SUBDIV_TRANSFORM.mat"
),
}
return meta
def _load_coco_annotations(json_file: str):
"""
Load COCO annotations from a JSON file
Args:
json_file: str
Path to the file to load annotations from
Returns:
Instance of `pycocotools.coco.COCO` that provides access to annotations
data
"""
from pycocotools.coco import COCO
logger = logging.getLogger(__name__)
timer = Timer()
with contextlib.redirect_stdout(io.StringIO()):
coco_api = COCO(json_file)
if timer.seconds() > 1:
logger.info("Loading {} takes {:.2f} seconds.".format(json_file, timer.seconds()))
return coco_api
def _add_categories_metadata(dataset_name: str, categories: Dict[str, Any]):
meta = MetadataCatalog.get(dataset_name)
meta.categories = {c["id"]: c["name"] for c in categories}
logger = logging.getLogger(__name__)
logger.info("Dataset {} categories: {}".format(dataset_name, categories))
def _verify_annotations_have_unique_ids(json_file: str, anns: List[List[Dict[str, Any]]]):
if "minival" in json_file:
# Skip validation on COCO2014 valminusminival and minival annotations
# The ratio of buggy annotations there is tiny and does not affect accuracy
# Therefore we explicitly white-list them
return
ann_ids = [ann["id"] for anns_per_image in anns for ann in anns_per_image]
assert len(set(ann_ids)) == len(ann_ids), "Annotation ids in '{}' are not unique!".format(
json_file
)
def _maybe_add_bbox(obj: Dict[str, Any], ann_dict: Dict[str, Any]):
if "bbox" not in ann_dict:
return
obj["bbox"] = ann_dict["bbox"]
obj["bbox_mode"] = BoxMode.XYWH_ABS
def _maybe_add_segm(obj: Dict[str, Any], ann_dict: Dict[str, Any]):
if "segmentation" not in ann_dict:
return
segm = ann_dict["segmentation"]
if not isinstance(segm, dict):
# filter out invalid polygons (< 3 points)
segm = [poly for poly in segm if len(poly) % 2 == 0 and len(poly) >= 6]
if len(segm) == 0:
return
obj["segmentation"] = segm
def _maybe_add_keypoints(obj: Dict[str, Any], ann_dict: Dict[str, Any]):
if "keypoints" not in ann_dict:
return
keypts = ann_dict["keypoints"] # list[int]
for idx, v in enumerate(keypts):
if idx % 3 != 2:
# COCO's segmentation coordinates are floating points in [0, H or W],
# but keypoint coordinates are integers in [0, H-1 or W-1]
# Therefore we assume the coordinates are "pixel indices" and
# add 0.5 to convert to floating point coordinates.
keypts[idx] = v + 0.5
obj["keypoints"] = keypts
def _maybe_add_densepose(obj: Dict[str, Any], ann_dict: Dict[str, Any]):
for key in DENSEPOSE_KEYS:
if key in ann_dict:
obj[key] = ann_dict[key]
def _combine_images_with_annotations(
dataset_name: str,
image_root: str,
img_datas: Iterable[Dict[str, Any]],
ann_datas: Iterable[Iterable[Dict[str, Any]]],
):
ann_keys = ["iscrowd", "category_id"]
dataset_dicts = []
contains_video_frame_info = False
for img_dict, ann_dicts in zip(img_datas, ann_datas):
record = {}
record["file_name"] = os.path.join(image_root, img_dict["file_name"])
record["height"] = img_dict["height"]
record["width"] = img_dict["width"]
record["image_id"] = img_dict["id"]
record["dataset"] = dataset_name
if "frame_id" in img_dict:
record["frame_id"] = img_dict["frame_id"]
record["video_id"] = img_dict.get("vid_id", None)
contains_video_frame_info = True
objs = []
for ann_dict in ann_dicts:
assert ann_dict["image_id"] == record["image_id"]
assert ann_dict.get("ignore", 0) == 0
obj = {key: ann_dict[key] for key in ann_keys if key in ann_dict}
_maybe_add_bbox(obj, ann_dict)
_maybe_add_segm(obj, ann_dict)
_maybe_add_keypoints(obj, ann_dict)
_maybe_add_densepose(obj, ann_dict)
objs.append(obj)
record["annotations"] = objs
dataset_dicts.append(record)
if contains_video_frame_info:
create_video_frame_mapping(dataset_name, dataset_dicts)
return dataset_dicts
def create_video_frame_mapping(dataset_name, dataset_dicts):
mapping = defaultdict(dict)
for d in dataset_dicts:
video_id = d.get("video_id")
if video_id is None:
continue
mapping[video_id].update({d["frame_id"]: d["file_name"]})
MetadataCatalog.get(dataset_name).set(video_frame_mapping=mapping)
def load_coco_json(annotations_json_file: str, image_root: str, dataset_name: str):
"""
Loads a JSON file with annotations in COCO instances format.
Replaces `detectron2.data.datasets.coco.load_coco_json` to handle metadata
in a more flexible way. Postpones category mapping to a later stage to be
able to combine several datasets with different (but coherent) sets of
categories.
Args:
annotations_json_file: str
Path to the JSON file with annotations in COCO instances format.
image_root: str
directory that contains all the images
dataset_name: str
the name that identifies a dataset, e.g. "densepose_coco_2014_train"
"""
coco_api = _load_coco_annotations(PathManager.get_local_path(annotations_json_file))
_add_categories_metadata(dataset_name, coco_api.loadCats(coco_api.getCatIds()))
# sort indices for reproducible results
img_ids = sorted(coco_api.imgs.keys())
# imgs is a list of dicts, each looks something like:
# {'license': 4,
# 'url': 'http://farm6.staticflickr.com/5454/9413846304_881d5e5c3b_z.jpg',
# 'file_name': 'COCO_val2014_000000001268.jpg',
# 'height': 427,
# 'width': 640,
# 'date_captured': '2013-11-17 05:57:24',
# 'id': 1268}
imgs = coco_api.loadImgs(img_ids)
logger = logging.getLogger(__name__)
logger.info("Loaded {} images in COCO format from {}".format(len(imgs), annotations_json_file))
# anns is a list[list[dict]], where each dict is an annotation
# record for an object. The inner list enumerates the objects in an image
# and the outer list enumerates over images.
anns = [coco_api.imgToAnns[img_id] for img_id in img_ids]
_verify_annotations_have_unique_ids(annotations_json_file, anns)
dataset_records = _combine_images_with_annotations(dataset_name, image_root, imgs, anns)
return dataset_records
def register_dataset(dataset_data: CocoDatasetInfo, datasets_root: Optional[os.PathLike] = None):
"""
Registers provided COCO DensePose dataset
Args:
dataset_data: CocoDatasetInfo
Dataset data
datasets_root: Optional[os.PathLike]
Datasets root folder (default: None)
"""
annotations_fpath = maybe_prepend_base_path(datasets_root, dataset_data.annotations_fpath)
images_root = maybe_prepend_base_path(datasets_root, dataset_data.images_root)
def load_annotations():
return load_coco_json(
annotations_json_file=annotations_fpath,
image_root=images_root,
dataset_name=dataset_data.name,
)
DatasetCatalog.register(dataset_data.name, load_annotations)
MetadataCatalog.get(dataset_data.name).set(
json_file=annotations_fpath,
image_root=images_root,
**get_metadata(DENSEPOSE_METADATA_URL_PREFIX)
)
def register_datasets(
datasets_data: Iterable[CocoDatasetInfo], datasets_root: Optional[os.PathLike] = None
):
"""
Registers provided COCO DensePose datasets
Args:
datasets_data: Iterable[CocoDatasetInfo]
An iterable of dataset descriptors
datasets_root: Optional[os.PathLike]
Datasets root folder (default: None)
"""
for dataset_data in datasets_data:
register_dataset(dataset_data, datasets_root)
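# Illustrative sketch (hypothetical helper): registering an additional
# COCO-format DensePose dataset; the name and paths are placeholder assumptions.
def _example_register_custom_dataset():
    my_dataset = CocoDatasetInfo(
        name="densepose_my_custom_train",
        images_root="my_dataset/images",
        annotations_fpath="my_dataset/annotations/densepose_train.json",
    )
    register_dataset(my_dataset, datasets_root="datasets")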

View File

@ -0,0 +1,11 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
from enum import Enum
class DatasetType(Enum):
"""
Dataset type, mostly used for datasets that contain data to bootstrap models on
"""
VIDEO_LIST = "video_list"

View File

@ -0,0 +1,53 @@
# -*- coding: utf-8 -*-
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import logging
import numpy as np
from typing import Callable, List, Optional
import torch
from torch.utils.data.dataset import Dataset
from detectron2.data.detection_utils import read_image
ImageTransform = Callable[[torch.Tensor], torch.Tensor]
class ImageListDataset(Dataset):
"""
Dataset that provides images from a list.
"""
_EMPTY_IMAGE = torch.empty((1, 1, 3))
def __init__(self, image_list: List[str], transform: Optional[ImageTransform] = None):
"""
Args:
image_list (List[str]): list of paths to image files
"""
self.image_list = image_list
self.transform = transform
def __getitem__(self, idx: int) -> torch.Tensor:
"""
Gets the selected image from the list
Args:
idx (int): index of the image in the image list
Returns:
image (torch.Tensor): tensor of size [H, W, 3]
"""
fpath = self.image_list[idx]
try:
image = torch.from_numpy(np.ascontiguousarray(read_image(fpath, format="BGR")))
if self.transform is not None:
image = self.transform(image.unsqueeze(0))[0] # Transforms are done on batches
return image
except (OSError, RuntimeError) as e:
logger = logging.getLogger(__name__)
logger.warning(f"Error opening image file {fpath}: {e}")
return self._EMPTY_IMAGE
def __len__(self):
return len(self.image_list)
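# Illustrative sketch (hypothetical helper): the file paths below are placeholder
# assumptions; unreadable files yield the (1, 1, 3) placeholder tensor instead of
# raising, so the dataset never breaks a data loading loop.
def _example_image_list_dataset():
    dataset = ImageListDataset(["/tmp/example_0.jpg", "/tmp/example_1.jpg"])
    return len(dataset), dataset[0].shape  # 2 and [H, W, 3] (or [1, 1, 3] if unreadable)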

View File

@ -0,0 +1,146 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import random
from typing import Any, Callable, Iterable, Iterator, List, Optional, Tuple
import torch
from torch import nn
SampledData = Any
ModelOutput = Any
def _grouper(iterable: Iterable[Any], n: int, fillvalue=None) -> Iterator[Tuple[Any]]:
"""
Group elements of an iterable by chunks of size `n`, e.g.
grouper(range(9), 4) ->
(0, 1, 2, 3), (4, 5, 6, 7), (8, None, None, None)
"""
it = iter(iterable)
while True:
values = []
for _ in range(n):
try:
value = next(it)
except StopIteration:
if values:
values.extend([fillvalue] * (n - len(values)))
yield tuple(values)
return
values.append(value)
yield tuple(values)
class ScoreBasedFilter:
"""
Filters entries in model output based on their scores
Discards all entries with score less than the specified minimum
"""
def __init__(self, min_score: float = 0.8):
self.min_score = min_score
def __call__(self, model_output: ModelOutput) -> ModelOutput:
for model_output_i in model_output:
instances = model_output_i["instances"]
if not instances.has("scores"):
continue
instances_filtered = instances[instances.scores >= self.min_score]
model_output_i["instances"] = instances_filtered
return model_output
class InferenceBasedLoader:
"""
Data loader based on results inferred by a model. Consists of:
- a data loader that provides batches of images
- a model that is used to infer the results
- a data sampler that converts inferred results to annotations
"""
def __init__(
self,
model: nn.Module,
data_loader: Iterable[List[torch.Tensor]],
data_sampler: Optional[Callable[[ModelOutput], List[SampledData]]] = None,
data_filter: Optional[Callable[[ModelOutput], ModelOutput]] = None,
shuffle: bool = True,
batch_size: int = 4,
inference_batch_size: int = 4,
drop_last: bool = False,
):
"""
Constructor
Args:
model (torch.nn.Module): model used to produce data
data_loader (Iterable[Tensor]): iterable that provides images
to perform inference on
data_sampler (Callable: ModelOutput -> SampledData): functor
that produces annotation data from inference results;
(optional, default: None)
data_filter (Callable: ModelOutput -> ModelOutput): filter
that selects model outputs for further processing
(optional, default: None)
shuffle (bool): if True, the input images get shuffled
batch_size (int): batch size for the produced annotation data
inference_batch_size (int): batch size for input images
drop_last (bool): if True, drop the last batch if it is undersized
"""
self.model = model
self.model.eval()
self.data_loader = data_loader
self.data_sampler = data_sampler
self.data_filter = data_filter
self.shuffle = shuffle
self.batch_size = batch_size
self.inference_batch_size = inference_batch_size
self.drop_last = drop_last
def __iter__(self) -> Iterator[List[SampledData]]:
for batch in self.data_loader:
# batch : List[Tensor[N, C, H, W]]
# images_batch : Tensor[N, C, H, W]
# image : Tensor[C, H, W]
images = [image for images_batch in batch for image in images_batch]
if not images:
continue
if self.shuffle:
random.shuffle(images)
yield from self._produce_data(images)
def _produce_data(self, images: List[torch.Tensor]) -> Iterator[List[SampledData]]:
"""
Produce batches of data from images
Args:
images (List[Tensor]): list of images to process
Returns:
Iterator over batches of data sampled from model outputs
"""
data_batches: List[SampledData] = []
batched_images = _grouper(images, self.inference_batch_size)
for batch in batched_images:
batch = [{"image": img.to(self.model.device)} for img in batch if img is not None]
if not batch:
continue
with torch.no_grad():
model_output = self.model(batch)
for model_output_i, batch_i in zip(model_output, batch):
model_output_i["image"] = batch_i["image"]
model_output_filtered = (
model_output if self.data_filter is None else self.data_filter(model_output)
)
data = (
model_output_filtered
if self.data_sampler is None
else self.data_sampler(model_output_filtered)
)
for data_i in data:
if len(data_i["instances"]):
data_batches.append(data_i)
if len(data_batches) >= self.batch_size:
yield data_batches[: self.batch_size]
data_batches = data_batches[self.batch_size :]
if not self.drop_last and data_batches:
yield data_batches
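# Illustrative end-to-end sketch (hypothetical helper): `model` is assumed to be
# an eval-mode DensePose R-CNN and `image_loader` to yield batches of image
# tensors; a data sampler from the samplers package (shown later in this diff)
# can be plugged in to convert predictions to ground truth.
def _example_inference_based_loader(model, image_loader):
    loader = InferenceBasedLoader(
        model,
        data_loader=image_loader,
        data_sampler=None,  # e.g. a PredictionToGroundTruthSampler with DensePose samplers
        data_filter=ScoreBasedFilter(min_score=0.8),
        batch_size=4,
    )
    # Yields lists of up to 4 dicts whose "instances" can be used as training GT.
    return next(iter(loader))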

View File

@ -0,0 +1,6 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
from .densepose_uniform import DensePoseUniformSampler
from .densepose_confidence_based import DensePoseConfidenceBasedSampler
from .mask_from_densepose import MaskFromDensePoseSampler, densepose_to_mask
from .prediction_to_gt import PredictionToGroundTruthSampler

View File

@ -0,0 +1,190 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
from typing import List, Optional
import torch
from torch.nn import functional as F
from detectron2.structures import BoxMode, Instances
from ..structures import (
DensePoseDataRelative,
DensePoseList,
DensePoseOutput,
resample_output_to_bbox,
)
class DensePoseBaseSampler:
"""
Base DensePose sampler to produce DensePose data from DensePose predictions.
Samples for each class are drawn according to some distribution over all pixels estimated
to belong to that class.
"""
def __init__(self, count_per_class: int = 8):
"""
Constructor
Args:
count_per_class (int): the sampler produces at most `count_per_class`
samples for each category
"""
self.count_per_class = count_per_class
def __call__(self, instances: Instances) -> DensePoseList:
"""
Convert DensePose predictions (an instance of `DensePoseOutput`)
into DensePose annotations data (an instance of `DensePoseList`)
"""
boxes_xyxy_abs = instances.pred_boxes.tensor.clone().cpu()
boxes_xywh_abs = BoxMode.convert(boxes_xyxy_abs, BoxMode.XYXY_ABS, BoxMode.XYWH_ABS)
dp_datas = []
for i, box_xywh in enumerate(boxes_xywh_abs):
labels_i, result_i = resample_output_to_bbox(
instances.pred_densepose[i], box_xywh, self._confidence_channels()
)
annotation_i = self._sample(labels_i.cpu(), result_i.cpu(), box_xywh)
annotation_i[DensePoseDataRelative.S_KEY] = self._resample_mask(
instances.pred_densepose[i]
)
dp_datas.append(DensePoseDataRelative(annotation_i))
# create densepose annotations on CPU
dp_list = DensePoseList(dp_datas, boxes_xyxy_abs, instances.image_size)
return dp_list
def _sample(
self, labels: torch.Tensor, dp_result: torch.Tensor, bbox_xywh: List[int]
) -> DensePoseDataRelative:
"""
Sample DensePoseDataRelative data from estimation results
"""
annotation = {
DensePoseDataRelative.X_KEY: [],
DensePoseDataRelative.Y_KEY: [],
DensePoseDataRelative.U_KEY: [],
DensePoseDataRelative.V_KEY: [],
DensePoseDataRelative.I_KEY: [],
}
x0, y0, _, _ = bbox_xywh
n, h, w = dp_result.shape
for part_id in range(1, DensePoseDataRelative.N_PART_LABELS + 1):
# indices - tuple of 3 1D tensors of size k
# 0: index along the first dimension N
# 1: index along H dimension
# 2: index along W dimension
indices = torch.nonzero(labels.expand(n, h, w) == part_id, as_tuple=True)
# values - an array of size [n, k]
# n: number of channels (U, V, confidences)
# k: number of points labeled with part_id
values = dp_result[indices].view(n, -1)
k = values.shape[1]
count = min(self.count_per_class, k)
if count <= 0:
continue
index_sample = self._produce_index_sample(values, count)
sampled_values = values[:, index_sample]
sampled_y = indices[1][index_sample] + 0.5
sampled_x = indices[2][index_sample] + 0.5
# prepare / normalize data
x = (sampled_x / w * 256.0).cpu().tolist()
y = (sampled_y / h * 256.0).cpu().tolist()
u = sampled_values[0].clamp(0, 1).cpu().tolist()
v = sampled_values[1].clamp(0, 1).cpu().tolist()
fine_segm_labels = [part_id] * count
# extend annotations
annotation[DensePoseDataRelative.X_KEY].extend(x)
annotation[DensePoseDataRelative.Y_KEY].extend(y)
annotation[DensePoseDataRelative.U_KEY].extend(u)
annotation[DensePoseDataRelative.V_KEY].extend(v)
annotation[DensePoseDataRelative.I_KEY].extend(fine_segm_labels)
return annotation
def _confidence_channels(self) -> Optional[List[str]]:
"""
Confidence channels to be used for sampling (to be overridden in subclasses)
"""
return None
def _produce_index_sample(self, values: torch.Tensor, count: int):
"""
Abstract method to produce a sample of indices to select data
To be implemented in descendants
Args:
values (torch.Tensor): an array of size [n, k] that contains
estimated values (U, V, confidences);
n: number of channels (U, V, confidences)
k: number of points labeled with part_id
count (int): number of samples to produce, should be positive and <= k
Return:
list(int): indices of values (along axis 1) selected as a sample
"""
raise NotImplementedError
def _resample_mask(self, output: DensePoseOutput) -> torch.Tensor:
"""
Convert output mask tensors into the annotation mask tensor of size
(256, 256)
"""
sz = DensePoseDataRelative.MASK_SIZE
S = (
F.interpolate(output.S, (sz, sz), mode="bilinear", align_corners=False)
.argmax(dim=1)
.long()
)
I = (
(
F.interpolate(output.I, (sz, sz), mode="bilinear", align_corners=False).argmax(
dim=1
)
* (S > 0).long()
)
.squeeze()
.cpu()
)
# Map fine segmentation results to coarse segmentation ground truth
# TODO: extract this into separate classes
# coarse segmentation: 1 = Torso, 2 = Right Hand, 3 = Left Hand,
# 4 = Left Foot, 5 = Right Foot, 6 = Upper Leg Right, 7 = Upper Leg Left,
# 8 = Lower Leg Right, 9 = Lower Leg Left, 10 = Upper Arm Left,
# 11 = Upper Arm Right, 12 = Lower Arm Left, 13 = Lower Arm Right,
# 14 = Head
# fine segmentation: 1, 2 = Torso, 3 = Right Hand, 4 = Left Hand,
# 5 = Left Foot, 6 = Right Foot, 7, 9 = Upper Leg Right,
# 8, 10 = Upper Leg Left, 11, 13 = Lower Leg Right,
# 12, 14 = Lower Leg Left, 15, 17 = Upper Arm Left,
# 16, 18 = Upper Arm Right, 19, 21 = Lower Arm Left,
# 20, 22 = Lower Arm Right, 23, 24 = Head
FINE_TO_COARSE_SEGMENTATION = {
1: 1,
2: 1,
3: 2,
4: 3,
5: 4,
6: 5,
7: 6,
8: 7,
9: 6,
10: 7,
11: 8,
12: 9,
13: 8,
14: 9,
15: 10,
16: 11,
17: 10,
18: 11,
19: 12,
20: 13,
21: 12,
22: 13,
23: 14,
24: 14,
}
mask = torch.zeros((sz, sz), dtype=torch.int64, device=torch.device("cpu"))
for i in range(DensePoseDataRelative.N_PART_LABELS):
mask[I == i + 1] = FINE_TO_COARSE_SEGMENTATION[i + 1]
return mask

View File

@ -0,0 +1,91 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import random
from typing import List, Optional
import torch
from .densepose_base import DensePoseBaseSampler
class DensePoseConfidenceBasedSampler(DensePoseBaseSampler):
"""
Samples DensePose data from DensePose predictions.
Samples for each class are drawn using confidence value estimates.
"""
def __init__(
self,
confidence_channel: str,
count_per_class: int = 8,
search_count_multiplier: Optional[float] = None,
search_proportion: Optional[float] = None,
):
"""
Constructor
Args:
confidence_channel (str): confidence channel to use for sampling;
possible values:
"sigma_2": confidences for UV values
"fine_segm_confidence": confidences for fine segmentation
"coarse_segm_confidence": confidences for coarse segmentation
(default: "sigma_2")
count_per_class (int): the sampler produces at most `count_per_class`
samples for each category (default: 8)
search_count_multiplier (float or None): if not None, the total number
of the most confident estimates of a given class to consider is
defined as `min(search_count_multiplier * count_per_class, N)`,
where `N` is the total number of estimates of the class; cannot be
specified together with `search_proportion` (default: None)
search_proportion (float or None): if not None, the total number of the
most confident estimates of a given class to consider is
defined as `min(max(search_proportion * N, count_per_class), N)`,
where `N` is the total number of estimates of the class; cannot be
specified together with `search_count_multiplier` (default: None)
"""
super().__init__(count_per_class)
self.confidence_channel = confidence_channel
self.search_count_multiplier = search_count_multiplier
self.search_proportion = search_proportion
assert (search_count_multiplier is None) or (search_proportion is None), (
f"Cannot specify both search_count_multiplier (={search_count_multiplier})"
f"and search_proportion (={search_proportion})"
)
def _confidence_channels(self) -> Optional[List[str]]:
"""
Confidence channels to be used for sampling (overrides the base class)
"""
return [self.confidence_channel]
def _produce_index_sample(self, values: torch.Tensor, count: int):
"""
Produce a sample of indices to select data based on confidences
Args:
values (torch.Tensor): an array of size [n, k] that contains
estimated values (U, V, confidences);
n: number of channels (U, V, confidences)
k: number of points labeled with part_id
count (int): number of samples to produce, should be positive and <= k
Return:
list(int): indices of values (along axis 1) selected as a sample
"""
k = values.shape[1]
if k == count:
index_sample = list(range(k))
else:
# take the best count * search_count_multiplier pixels,
# sample from them uniformly
# (here best = smallest variance)
_, sorted_confidence_indices = torch.sort(values[2])
if self.search_count_multiplier is not None:
search_count = min(int(count * self.search_count_multiplier), k)
elif self.search_proportion is not None:
search_count = min(max(int(k * self.search_proportion), count), k)
else:
search_count = min(count, k)
sample_from_top = random.sample(range(search_count), count)
index_sample = sorted_confidence_indices[:search_count][sample_from_top]
return index_sample
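# Illustrative construction sketch (hypothetical helper): the channel name and
# counts mirror the values used by build_data_sampler in the data builders above.
def _example_confidence_sampler():
    return DensePoseConfidenceBasedSampler(
        confidence_channel="sigma_2",  # UV confidence channel
        count_per_class=8,
        search_proportion=0.5,  # sample only from the most confident half of the pixels
    )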

View File

@ -0,0 +1,41 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import random
import torch
from .densepose_base import DensePoseBaseSampler
class DensePoseUniformSampler(DensePoseBaseSampler):
"""
Samples DensePose data from DensePose predictions.
Samples for each class are drawn uniformly over all pixels estimated
to belong to that class.
"""
def __init__(self, count_per_class: int = 8):
"""
Constructor
Args:
count_per_class (int): the sampler produces at most `count_per_class`
samples for each category
"""
super().__init__(count_per_class)
def _produce_index_sample(self, values: torch.Tensor, count: int):
"""
Produce a uniform sample of indices to select data
Args:
values (torch.Tensor): an array of size [n, k] that contains
estimated values (U, V, confidences);
n: number of channels (U, V, confidences)
k: number of points labeled with part_id
count (int): number of samples to produce, should be positive and <= k
Return:
list(int): indices of values (along axis 1) selected as a sample
"""
k = values.shape[1]
return random.sample(range(k), count)

View File

@ -0,0 +1,59 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import torch
from detectron2.structures import BitMasks, BoxMode, Instances
from ..structures import resample_output_to_bbox
def densepose_to_mask(instances: Instances) -> BitMasks:
"""
Produce masks from DensePose predictions
DensePose predictions for a given image, stored in `pred_densepose` field,
are instances of DensePoseOutput. This function takes
`S` and `I` output tensors (coarse and fine segmentation) and converts
them to a mask tensor, which is a boolean tensor of the size of the input
image.
Args:
instances (Instances): predicted results, expected to have `pred_densepose` field
that contains `DensePoseOutput` objects
Returns:
`BitMasks` instance with boolean tensors of the size of the input image that have non-zero
values at pixels that are estimated to belong to the detected objects
"""
H, W = instances.image_size
boxes_xyxy_abs = instances.pred_boxes.tensor.clone().cpu()
boxes_xywh_abs = BoxMode.convert(boxes_xyxy_abs, BoxMode.XYXY_ABS, BoxMode.XYWH_ABS)
N = len(boxes_xywh_abs)
gt_masks = torch.zeros((N, H, W), dtype=torch.bool, device=torch.device("cpu"))
for i, box_xywh in enumerate(boxes_xywh_abs):
labels_i, _ = resample_output_to_bbox(instances.pred_densepose[i], box_xywh)
x, y, w, h = box_xywh.long().tolist()
gt_masks[i, y : y + h, x : x + w] = labels_i.cpu() > 0
return BitMasks(gt_masks)
class MaskFromDensePoseSampler:
"""
Produce mask GT from DensePose predictions
DensePose prediction is an instance of DensePoseOutput. This sampler takes
`S` and `I` output tensors (coarse and fine segmentation) and converts
them to a mask tensor, which is a boolean tensor of the size of the input
image.
"""
def __call__(self, instances: Instances) -> BitMasks:
"""
Converts predicted data from `instances` into the GT mask data
Args:
instances (Instances): predicted results, expected to have `pred_densepose` field
Returns:
`BitMasks` with boolean tensors of the size of the input image that have
non-zero values at pixels estimated to belong to the detected objects
"""
return densepose_to_mask(instances)
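# Illustrative sketch (hypothetical helper): `instances` is assumed to carry
# `pred_boxes` and `pred_densepose` fields produced by a DensePose model.
def _example_masks_from_densepose(instances):
    bitmasks = MaskFromDensePoseSampler()(instances)  # same as densepose_to_mask(instances)
    return bitmasks.tensor.shape  # (N, H, W) boolean mask, one per detected person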

View File

@ -0,0 +1,80 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
from dataclasses import dataclass
from typing import Any, Callable, Dict, Optional
from detectron2.structures import Instances
ModelOutput = Dict[str, Any]
SampledData = Dict[str, Any]
@dataclass
class _Sampler:
"""
Sampler registry entry that contains:
- src (str): source field to sample from (deleted after sampling)
- dst (Optional[str]): destination field to sample to, if not None
- func (Optional[Callable: Any -> Any]): function that performs sampling,
if None, reference copy is performed
"""
src: str
dst: Optional[str]
func: Optional[Callable[[Any], Any]]
class PredictionToGroundTruthSampler:
"""
Sampler implementation that converts predictions to GT using registered
samplers for different fields of `Instances`.
"""
def __init__(self, dataset_name: str = ""):
self.dataset_name = dataset_name
self._samplers = {}
self.register_sampler("pred_boxes", "gt_boxes", None)
self.register_sampler("pred_classes", "gt_classes", None)
self.register_sampler("scores")
def __call__(self, model_output: ModelOutput) -> SampledData:
"""
Transform model output into ground truth data through sampling
Args:
model_output: model outputs for a batch of images, one dict per image
Returns:
the same outputs with predicted fields replaced by sampled ground truth data
"""
for model_output_i in model_output:
instances: Instances = model_output_i["instances"]
# transform data in each field
for _, sampler in self._samplers.items():
if not instances.has(sampler.src) or sampler.dst is None:
continue
if sampler.func is None:
instances.set(sampler.dst, instances.get(sampler.src))
else:
instances.set(sampler.dst, sampler.func(instances))
# delete model output data that was transformed
for _, sampler in self._samplers.items():
if sampler.src != sampler.dst and instances.has(sampler.src):
instances.remove(sampler.src)
model_output_i["dataset"] = self.dataset_name
return model_output
def register_sampler(
self,
prediction_attr: str,
gt_attr: Optional[str] = None,
func: Optional[Callable[[Any], Any]] = None,
):
"""
Register sampler for a field
Args:
prediction_attr (str): field to replace with a sampled value
gt_attr (Optional[str]): field to store the sampled value to, if not None
func (Optional[Callable: Any -> Any]): sampler function
"""
self._samplers[prediction_attr] = _Sampler(src=prediction_attr, dst=gt_attr, func=func)
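# Illustrative sketch (hypothetical helper): registering an extra field sampler
# on top of the default boxes / classes / scores handling; `mask_sampler` is
# assumed to be e.g. a MaskFromDensePoseSampler from the samplers package above.
def _example_gt_sampler(mask_sampler):
    sampler = PredictionToGroundTruthSampler(dataset_name="my_bootstrap_set")
    sampler.register_sampler("pred_densepose", "gt_masks", mask_sampler)
    return sampler  # call it on model outputs to turn predictions into GT fields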

View File

@ -0,0 +1,703 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import base64
import numpy as np
from io import BytesIO
from typing import BinaryIO, Dict, List, Optional, Tuple, Union
import torch
from PIL import Image
from torch.nn import functional as F
class DensePoseTransformData(object):
# Horizontal symmetry label transforms used for horizontal flip
MASK_LABEL_SYMMETRIES = [0, 1, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 14]
# fmt: off
POINT_LABEL_SYMMETRIES = [ 0, 1, 2, 4, 3, 6, 5, 8, 7, 10, 9, 12, 11, 14, 13, 16, 15, 18, 17, 20, 19, 22, 21, 24, 23] # noqa
# fmt: on
def __init__(self, uv_symmetries: Dict[str, torch.Tensor], device: torch.device):
self.mask_label_symmetries = DensePoseTransformData.MASK_LABEL_SYMMETRIES
self.point_label_symmetries = DensePoseTransformData.POINT_LABEL_SYMMETRIES
self.uv_symmetries = uv_symmetries
self.device = torch.device("cpu")
def to(self, device: torch.device, copy: bool = False) -> "DensePoseTransformData":
"""
Convert transform data to the specified device
Args:
device (torch.device): device to convert the data to
copy (bool): flag that specifies whether to copy or to reference the data
in case the device is the same
Return:
An instance of `DensePoseTransformData` with data stored on the specified device
"""
if self.device == device and not copy:
return self
uv_symmetry_map = {}
for key in self.uv_symmetries:
uv_symmetry_map[key] = self.uv_symmetries[key].to(device=device, copy=copy)
return DensePoseTransformData(uv_symmetry_map, device)
@staticmethod
def load(io: Union[str, BinaryIO]):
"""
Args:
io: (str or binary file-like object): input file to load data from
Returns:
An instance of `DensePoseTransformData` with transforms loaded from the file
"""
import scipy.io
uv_symmetry_map = scipy.io.loadmat(io)
uv_symmetry_map_torch = {}
for key in ["U_transforms", "V_transforms"]:
uv_symmetry_map_torch[key] = []
map_src = uv_symmetry_map[key]
map_dst = uv_symmetry_map_torch[key]
for i in range(map_src.shape[1]):
map_dst.append(torch.from_numpy(map_src[0, i]).to(dtype=torch.float))
uv_symmetry_map_torch[key] = torch.stack(map_dst, dim=0)
transform_data = DensePoseTransformData(uv_symmetry_map_torch, device=torch.device("cpu"))
return transform_data
class DensePoseDataRelative(object):
"""
Dense pose relative annotations that can be applied to any bounding box:
x - normalized X coordinates [0, 255] of annotated points
y - normalized Y coordinates [0, 255] of annotated points
i - body part labels 0,...,24 for annotated points
u - body part U coordinates [0, 1] for annotated points
v - body part V coordinates [0, 1] for annotated points
segm - 256x256 segmentation mask with values 0,...,14
To obtain absolute x and y data wrt some bounding box one needs to first
divide the data by 256, multiply by the respective bounding box size
and add bounding box offset:
x_img = x0 + x_norm * w / 256.0
y_img = y0 + y_norm * h / 256.0
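For example (hypothetical numbers): x_norm = 128 in a box with x0 = 10 and
w = 200 gives x_img = 10 + 128 * 200 / 256.0 = 110.0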
Segmentation masks are typically sampled to get image-based masks.
"""
# Key for normalized X coordinates in annotation dict
X_KEY = "dp_x"
# Key for normalized Y coordinates in annotation dict
Y_KEY = "dp_y"
# Key for U part coordinates in annotation dict
U_KEY = "dp_U"
# Key for V part coordinates in annotation dict
V_KEY = "dp_V"
# Key for I point labels in annotation dict
I_KEY = "dp_I"
# Key for segmentation mask in annotation dict
S_KEY = "dp_masks"
# Number of body parts in segmentation masks
N_BODY_PARTS = 14
# Number of parts in point labels
N_PART_LABELS = 24
MASK_SIZE = 256
def __init__(self, annotation, cleanup=False):
is_valid, reason_not_valid = DensePoseDataRelative.validate_annotation(annotation)
assert is_valid, "Invalid DensePose annotations: {}".format(reason_not_valid)
self.x = torch.as_tensor(annotation[DensePoseDataRelative.X_KEY])
self.y = torch.as_tensor(annotation[DensePoseDataRelative.Y_KEY])
self.i = torch.as_tensor(annotation[DensePoseDataRelative.I_KEY])
self.u = torch.as_tensor(annotation[DensePoseDataRelative.U_KEY])
self.v = torch.as_tensor(annotation[DensePoseDataRelative.V_KEY])
self.segm = DensePoseDataRelative.extract_segmentation_mask(annotation)
self.device = torch.device("cpu")
if cleanup:
DensePoseDataRelative.cleanup_annotation(annotation)
def to(self, device):
if self.device == device:
return self
new_data = DensePoseDataRelative.__new__(DensePoseDataRelative)
new_data.x = self.x.to(device)
new_data.y = self.y.to(device)
new_data.i = self.i.to(device)
new_data.u = self.u.to(device)
new_data.v = self.v.to(device)
new_data.segm = self.segm.to(device)
new_data.device = device
return new_data
@staticmethod
def extract_segmentation_mask(annotation):
poly_specs = annotation[DensePoseDataRelative.S_KEY]
if isinstance(poly_specs, torch.Tensor):
# data is already given as mask tensors, no need to decode
return poly_specs
import pycocotools.mask as mask_utils
segm = torch.zeros((DensePoseDataRelative.MASK_SIZE,) * 2, dtype=torch.float32)
for i in range(DensePoseDataRelative.N_BODY_PARTS):
poly_i = poly_specs[i]
if poly_i:
mask_i = mask_utils.decode(poly_i)
segm[mask_i > 0] = i + 1
return segm
@staticmethod
def validate_annotation(annotation):
for key in [
DensePoseDataRelative.X_KEY,
DensePoseDataRelative.Y_KEY,
DensePoseDataRelative.I_KEY,
DensePoseDataRelative.U_KEY,
DensePoseDataRelative.V_KEY,
DensePoseDataRelative.S_KEY,
]:
if key not in annotation:
return False, "no {key} data in the annotation".format(key=key)
return True, None
@staticmethod
def cleanup_annotation(annotation):
for key in [
DensePoseDataRelative.X_KEY,
DensePoseDataRelative.Y_KEY,
DensePoseDataRelative.I_KEY,
DensePoseDataRelative.U_KEY,
DensePoseDataRelative.V_KEY,
DensePoseDataRelative.S_KEY,
]:
if key in annotation:
del annotation[key]
def apply_transform(self, transforms, densepose_transform_data):
self._transform_pts(transforms, densepose_transform_data)
self._transform_segm(transforms, densepose_transform_data)
def _transform_pts(self, transforms, dp_transform_data):
import detectron2.data.transforms as T
# NOTE: This assumes that HorizFlipTransform is the only one that does flip
do_hflip = sum(isinstance(t, T.HFlipTransform) for t in transforms.transforms) % 2 == 1
if do_hflip:
self.x = self.segm.size(1) - self.x
self._flip_iuv_semantics(dp_transform_data)
for t in transforms.transforms:
if isinstance(t, T.RotationTransform):
xy_scale = np.array((t.w, t.h)) / DensePoseDataRelative.MASK_SIZE
xy = t.apply_coords(np.stack((self.x, self.y), axis=1) * xy_scale)
self.x, self.y = torch.tensor(xy / xy_scale, dtype=self.x.dtype).T
def _flip_iuv_semantics(self, dp_transform_data: DensePoseTransformData) -> None:
i_old = self.i.clone()
uv_symmetries = dp_transform_data.uv_symmetries
pt_label_symmetries = dp_transform_data.point_label_symmetries
for i in range(self.N_PART_LABELS):
if i + 1 in i_old:
annot_indices_i = i_old == i + 1
if pt_label_symmetries[i + 1] != i + 1:
self.i[annot_indices_i] = pt_label_symmetries[i + 1]
u_loc = (self.u[annot_indices_i] * 255).long()
v_loc = (self.v[annot_indices_i] * 255).long()
self.u[annot_indices_i] = uv_symmetries["U_transforms"][i][v_loc, u_loc].to(
device=self.u.device
)
self.v[annot_indices_i] = uv_symmetries["V_transforms"][i][v_loc, u_loc].to(
device=self.v.device
)
def _transform_segm(self, transforms, dp_transform_data):
import detectron2.data.transforms as T
# NOTE: This assumes that HorizFlipTransform is the only one that does flip
do_hflip = sum(isinstance(t, T.HFlipTransform) for t in transforms.transforms) % 2 == 1
if do_hflip:
self.segm = torch.flip(self.segm, [1])
self._flip_segm_semantics(dp_transform_data)
for t in transforms.transforms:
if isinstance(t, T.RotationTransform):
self._transform_segm_rotation(t)
def _flip_segm_semantics(self, dp_transform_data):
old_segm = self.segm.clone()
mask_label_symmetries = dp_transform_data.mask_label_symmetries
for i in range(self.N_BODY_PARTS):
if mask_label_symmetries[i + 1] != i + 1:
self.segm[old_segm == i + 1] = mask_label_symmetries[i + 1]
def _transform_segm_rotation(self, rotation):
self.segm = F.interpolate(self.segm[None, None, :], (rotation.h, rotation.w)).numpy()
self.segm = torch.tensor(rotation.apply_segmentation(self.segm[0, 0]))[None, None, :]
self.segm = F.interpolate(self.segm, [DensePoseDataRelative.MASK_SIZE] * 2)[0, 0]
def normalized_coords_transform(x0, y0, w, h):
"""
Coordinates transform that maps top left corner to (-1, -1) and bottom
right corner to (1, 1). Used for torch.grid_sample to initialize the
grid
"""
def f(p):
return (2 * (p[0] - x0) / w - 1, 2 * (p[1] - y0) / h - 1)
return f
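# Illustrative sketch (hypothetical helper): the returned functor maps box
# corners to the grid_sample convention, e.g. for a box at (10, 20) of size 100 x 50.
def _example_normalized_coords():
    f = normalized_coords_transform(10, 20, 100, 50)
    return f((10, 20)), f((110, 70))  # (-1.0, -1.0) and (1.0, 1.0)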
class DensePoseOutput(object):
def __init__(self, S, I, U, V, confidences):
"""
Args:
S (`torch.Tensor`): coarse segmentation tensor of size (N, A, H, W)
I (`torch.Tensor`): fine segmentation tensor of size (N, C, H, W)
U (`torch.Tensor`): U coordinates for each fine segmentation label of size (N, C, H, W)
V (`torch.Tensor`): V coordinates for each fine segmentation label of size (N, C, H, W)
confidences (dict of str -> `torch.Tensor`) estimated confidence model parameters
"""
self.S = S
self.I = I # noqa: E741
self.U = U
self.V = V
self.confidences = confidences
self._check_output_dims(S, I, U, V)
def _check_output_dims(self, S, I, U, V):
assert (
len(S.size()) == 4
), "Segmentation output should have 4 " "dimensions (NCHW), but has size {}".format(
S.size()
)
assert (
len(I.size()) == 4
), "Part index output should have 4 " "dimensions (NCHW), but has size {}".format(
I.size()
)
assert (
len(U.size()) == 4
), "U coordinates output should have 4 " "dimensions (NCHW), but has size {}".format(
U.size()
)
assert (
len(V.size()) == 4
), "V coordinates output should have 4 " "dimensions (NCHW), but has size {}".format(
V.size()
)
assert len(S) == len(I), (
"Number of output segmentation planes {} "
"should be equal to the number of output part index "
"planes {}".format(len(S), len(I))
)
assert S.size()[2:] == I.size()[2:], (
"Output segmentation plane size {} "
"should be equal to the output part index "
"plane size {}".format(S.size()[2:], I.size()[2:])
)
assert I.size() == U.size(), (
"Part index output shape {} "
"should be the same as U coordinates output shape {}".format(I.size(), U.size())
)
assert I.size() == V.size(), (
"Part index output shape {} "
"should be the same as V coordinates output shape {}".format(I.size(), V.size())
)
def resize(self, image_size_hw):
# do nothing - outputs are invariant to resize
pass
def _crop(self, S, I, U, V, bbox_old_xywh, bbox_new_xywh):
"""
Resample S, I, U, V from bbox_old to the cropped bbox_new
"""
x0old, y0old, wold, hold = bbox_old_xywh
x0new, y0new, wnew, hnew = bbox_new_xywh
tr_coords = normalized_coords_transform(x0old, y0old, wold, hold)
topleft = (x0new, y0new)
bottomright = (x0new + wnew, y0new + hnew)
topleft_norm = tr_coords(topleft)
bottomright_norm = tr_coords(bottomright)
hsize = S.size(1)
wsize = S.size(2)
grid = torch.meshgrid(
torch.arange(
topleft_norm[1],
bottomright_norm[1],
(bottomright_norm[1] - topleft_norm[1]) / hsize,
)[:hsize],
torch.arange(
topleft_norm[0],
bottomright_norm[0],
(bottomright_norm[0] - topleft_norm[0]) / wsize,
)[:wsize],
)
grid = torch.stack(grid, dim=2).to(S.device)
assert (
grid.size(0) == hsize
), "Resampled grid expected " "height={}, actual height={}".format(hsize, grid.size(0))
assert grid.size(1) == wsize, "Resampled grid expected " "width={}, actual width={}".format(
wsize, grid.size(1)
)
S_new = F.grid_sample(
S.unsqueeze(0),
torch.unsqueeze(grid, 0),
mode="bilinear",
padding_mode="border",
align_corners=True,
).squeeze(0)
I_new = F.grid_sample(
I.unsqueeze(0),
torch.unsqueeze(grid, 0),
mode="bilinear",
padding_mode="border",
align_corners=True,
).squeeze(0)
U_new = F.grid_sample(
U.unsqueeze(0),
torch.unsqueeze(grid, 0),
mode="bilinear",
padding_mode="border",
align_corners=True,
).squeeze(0)
V_new = F.grid_sample(
V.unsqueeze(0),
torch.unsqueeze(grid, 0),
mode="bilinear",
padding_mode="border",
align_corners=True,
).squeeze(0)
return S_new, I_new, U_new, V_new
def crop(self, indices_cropped, bboxes_old, bboxes_new):
"""
Crop outputs for selected bounding boxes to the new bounding boxes.
"""
# VK: cropping is ignored for now
# for i, ic in enumerate(indices_cropped):
# self.S[ic], self.I[ic], self.U[ic], self.V[ic] = \
# self._crop(self.S[ic], self.I[ic], self.U[ic], self.V[ic],
# bboxes_old[i], bboxes_new[i])
pass
def hflip(self, transform_data: DensePoseTransformData) -> None:
"""
Change S, I, U and V to take into account a Horizontal flip.
"""
if self.I.shape[0] > 0:
for el in "SIUV":
self.__dict__[el] = torch.flip(self.__dict__[el], [3])
for key in self.confidences:
self.confidences[key] = torch.flip(self.confidences[key], [3])
self._flip_iuv_semantics_tensor(transform_data)
self._flip_segm_semantics_tensor(transform_data)
def _flip_iuv_semantics_tensor(self, dp_transform_data: DensePoseTransformData) -> None:
point_label_symmetries = dp_transform_data.point_label_symmetries
uv_symmetries = dp_transform_data.uv_symmetries
N, C, H, W = self.U.shape
u_loc = (self.U[:, 1:, :, :].clamp(0, 1) * 255).long()
v_loc = (self.V[:, 1:, :, :].clamp(0, 1) * 255).long()
Iindex = torch.arange(C - 1, device=self.U.device)[None, :, None, None].expand(
N, C - 1, H, W
)
self.U[:, 1:, :, :] = uv_symmetries["U_transforms"][Iindex, v_loc, u_loc]
self.V[:, 1:, :, :] = uv_symmetries["V_transforms"][Iindex, v_loc, u_loc]
for el in "IUV":
self.__dict__[el] = self.__dict__[el][:, point_label_symmetries, :, :]
def _flip_segm_semantics_tensor(self, dp_transform_data):
if self.S.shape[1] == DensePoseDataRelative.N_BODY_PARTS + 1:
self.S = self.S[:, dp_transform_data.mask_label_symmetries, :, :]
def to_result(self, boxes_xywh):
"""
Convert DensePose outputs to results format. Results are more compact,
but cannot be resampled any more
"""
result = DensePoseResult(boxes_xywh, self.S, self.I, self.U, self.V)
return result
def __getitem__(self, item):
if isinstance(item, int):
S_selected = self.S[item].unsqueeze(0)
I_selected = self.I[item].unsqueeze(0)
U_selected = self.U[item].unsqueeze(0)
V_selected = self.V[item].unsqueeze(0)
conf_selected = {}
for key in self.confidences:
conf_selected[key] = self.confidences[key][item].unsqueeze(0)
else:
S_selected = self.S[item]
I_selected = self.I[item]
U_selected = self.U[item]
V_selected = self.V[item]
conf_selected = {}
for key in self.confidences:
conf_selected[key] = self.confidences[key][item]
return DensePoseOutput(S_selected, I_selected, U_selected, V_selected, conf_selected)
def __str__(self):
s = "DensePoseOutput S {}, I {}, U {}, V {}".format(
list(self.S.size()), list(self.I.size()), list(self.U.size()), list(self.V.size())
)
s_conf = "confidences: [{}]".format(
", ".join([f"{key} {list(self.confidences[key].size())}" for key in self.confidences])
)
return ", ".join([s, s_conf])
def __len__(self):
return self.S.size(0)
def resample_output_to_bbox(
output: DensePoseOutput, bbox_xywh_abs: List[int], confidences: Optional[List[str]] = None
) -> Tuple[torch.Tensor, torch.Tensor]:
"""
Convert DensePose output of size [1, C, S, S] into DensePose results [D, H_i, W_i],
where `i` is detection index and `D == 2 + len(confidences)`. This conversion:
- resamples data to the detection bounding box size (H_i, W_i),
- sets label for each pixel of the bounding box as the `argmax` of scores,
- assigns values (U, V, confidences) based on label and resampled data
Args:
output (DensePoseOutput): outputs of the DensePose model
bbox_xywh_abs (List[int]): bounding box, a list of 4 integer values XYWH
confidences (List[str]): optional list of `str` that specifies confidence
channels to be resampled and added to the results
    Returns:
labels (torch.Tensor): tensor [1, H_i, W_i] of `torch.uint8` containing fine
segmentation labels of each pixel
data (torch.Tensor): tensor [D, H_i, W_i] of `torch.float32` containing
for each pixel the estimated U, V coordinates and the requested
confidence values in the order that corresponds to `confidences`
"""
x, y, w, h = bbox_xywh_abs
w = max(int(w), 1)
h = max(int(h), 1)
N_out = 2 if confidences is None else 2 + len(confidences)
device = output.U.device
data = torch.zeros([N_out, h, w], dtype=torch.float32, device=device)
# coarse segmentation
assert (
len(output.S.size()) == 4
), "AnnIndex tensor size should have {} dimensions but has {}".format(4, len(output.S.size()))
s_bbox = F.interpolate(output.S, (h, w), mode="bilinear", align_corners=False).argmax(dim=1)
# fine segmentation
assert (
len(output.I.size()) == 4
), "IndexUV tensor size should have {} dimensions but has {}".format(4, len(output.S.size()))
labels = (
F.interpolate(output.I, (h, w), mode="bilinear", align_corners=False).argmax(dim=1)
* (s_bbox > 0).long()
).squeeze(0)
# U
assert len(output.U.size()) == 4, "U tensor size should have {} dimensions but has {}".format(
4, len(output.U.size())
)
u_bbox = F.interpolate(output.U, (h, w), mode="bilinear", align_corners=False)
# V
assert len(output.V.size()) == 4, "V tensor size should have {} dimensions but has {}".format(
4, len(output.V.size())
)
v_bbox = F.interpolate(output.V, (h, w), mode="bilinear", align_corners=False)
# confidences
if confidences is not None:
resampled_confidence = {}
for key in output.confidences:
resampled_confidence[key] = F.interpolate(
output.confidences[key], (h, w), mode="bilinear", align_corners=False
)
# assign data from channels that correspond to the labels
for part_id in range(1, u_bbox.size(1)):
data[0][labels == part_id] = u_bbox[0, part_id][labels == part_id]
data[1][labels == part_id] = v_bbox[0, part_id][labels == part_id]
if confidences is None:
continue
for i, key in enumerate(confidences):
if resampled_confidence[key].size(1) != u_bbox.size(1):
# confidence is not part-based, don't try to fill it part by part
continue
data[2 + i][labels == part_id] = resampled_confidence[key][0, part_id][
labels == part_id
]
if confidences is not None:
for i, key in enumerate(confidences):
if resampled_confidence[key].size(1) != u_bbox.size(1):
# confidence is not part-based, fill the data with the first channel
# (targeted for segmentation confidences that have only 1 channel)
data[2 + i] = resampled_confidence[key][0, 0]
return labels.unsqueeze(0), data
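# Usage sketch: resample a single-instance DensePose output to its detection box.
# Here `output` is assumed to be a DensePoseOutput with tensors of shape [1, C, S, S]
# (e.g. obtained by indexing, `outputs[i]`), and "sigma_2" is a hypothetical confidence key.
#   labels, data = resample_output_to_bbox(output, [x, y, w, h], confidences=["sigma_2"])
#   # labels: [1, h, w] uint8 fine segmentation labels (0 = background)
#   # data:   [3, h, w] float32 with per-pixel U, V and the resampled confidence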
class DensePoseResult(object):
def __init__(self, boxes_xywh, S, I, U, V):
self.results = []
self.boxes_xywh = boxes_xywh.cpu().tolist()
assert len(boxes_xywh.size()) == 2
assert boxes_xywh.size(1) == 4
for i, box_xywh in enumerate(boxes_xywh):
result_i = self._output_to_result(box_xywh, S[[i]], I[[i]], U[[i]], V[[i]])
result_numpy_i = result_i.cpu().numpy()
result_encoded_i = DensePoseResult.encode_png_data(result_numpy_i)
result_encoded_with_shape_i = (result_numpy_i.shape, result_encoded_i)
self.results.append(result_encoded_with_shape_i)
def __str__(self):
s = "DensePoseResult: N={} [{}]".format(
len(self.results), ", ".join([str(list(r[0])) for r in self.results])
)
return s
def _output_to_result(self, box_xywh, S, I, U, V):
# TODO: reuse resample_output_to_bbox
x, y, w, h = box_xywh
w = max(int(w), 1)
h = max(int(h), 1)
result = torch.zeros([3, h, w], dtype=torch.uint8, device=U.device)
assert (
len(S.size()) == 4
), "AnnIndex tensor size should have {} " "dimensions but has {}".format(4, len(S.size()))
s_bbox = F.interpolate(S, (h, w), mode="bilinear", align_corners=False).argmax(dim=1)
assert (
len(I.size()) == 4
), "IndexUV tensor size should have {} " "dimensions but has {}".format(4, len(S.size()))
i_bbox = (
F.interpolate(I, (h, w), mode="bilinear", align_corners=False).argmax(dim=1)
* (s_bbox > 0).long()
).squeeze(0)
assert len(U.size()) == 4, "U tensor size should have {} " "dimensions but has {}".format(
4, len(U.size())
)
u_bbox = F.interpolate(U, (h, w), mode="bilinear", align_corners=False)
assert len(V.size()) == 4, "V tensor size should have {} " "dimensions but has {}".format(
4, len(V.size())
)
v_bbox = F.interpolate(V, (h, w), mode="bilinear", align_corners=False)
result[0] = i_bbox
for part_id in range(1, u_bbox.size(1)):
result[1][i_bbox == part_id] = (
(u_bbox[0, part_id][i_bbox == part_id] * 255).clamp(0, 255).to(torch.uint8)
)
result[2][i_bbox == part_id] = (
(v_bbox[0, part_id][i_bbox == part_id] * 255).clamp(0, 255).to(torch.uint8)
)
assert (
result.size(1) == h
), "Results height {} should be equal" "to bounding box height {}".format(result.size(1), h)
assert (
result.size(2) == w
), "Results width {} should be equal" "to bounding box width {}".format(result.size(2), w)
return result
@staticmethod
def encode_png_data(arr):
"""
Encode array data as a PNG image using the highest compression rate
@param arr [in] Data stored in an array of size (3, M, N) of type uint8
@return Base64-encoded string containing PNG-compressed data
"""
assert len(arr.shape) == 3, "Expected a 3D array as an input," " got a {0}D array".format(
len(arr.shape)
)
assert arr.shape[0] == 3, "Expected first array dimension of size 3," " got {0}".format(
arr.shape[0]
)
        assert arr.dtype == np.uint8, "Expected an array of type np.uint8, " "got {0}".format(
arr.dtype
)
data = np.moveaxis(arr, 0, -1)
im = Image.fromarray(data)
fstream = BytesIO()
im.save(fstream, format="png", optimize=True)
s = base64.encodebytes(fstream.getvalue()).decode()
return s
@staticmethod
def decode_png_data(shape, s):
"""
Decode array data from a string that contains PNG-compressed data
        @param shape [in] Shape of the output array (3, M, N)
        @param s [in] Base64-encoded string containing PNG-compressed data
@return Data stored in an array of size (3, M, N) of type uint8
"""
fstream = BytesIO(base64.decodebytes(s.encode()))
im = Image.open(fstream)
data = np.moveaxis(np.array(im.getdata(), dtype=np.uint8), -1, 0)
return data.reshape(shape)
def __len__(self):
return len(self.results)
def __getitem__(self, item):
result_encoded = self.results[item]
bbox_xywh = self.boxes_xywh[item]
return result_encoded, bbox_xywh
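# Round-trip sketch for the PNG / base64 packing used by DensePoseResult
# (the array content is arbitrary illustrative data):
#   arr = np.zeros((3, 64, 48), dtype=np.uint8)               # packed (I, U, V) planes
#   s = DensePoseResult.encode_png_data(arr)                  # base64 string
#   arr_back = DensePoseResult.decode_png_data(arr.shape, s)
#   assert (arr_back == arr).all()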
class DensePoseList(object):
_TORCH_DEVICE_CPU = torch.device("cpu")
def __init__(self, densepose_datas, boxes_xyxy_abs, image_size_hw, device=_TORCH_DEVICE_CPU):
assert len(densepose_datas) == len(
boxes_xyxy_abs
), "Attempt to initialize DensePoseList with {} DensePose datas " "and {} boxes".format(
len(densepose_datas), len(boxes_xyxy_abs)
)
self.densepose_datas = []
for densepose_data in densepose_datas:
assert isinstance(densepose_data, DensePoseDataRelative) or densepose_data is None, (
"Attempt to initialize DensePoseList with DensePose datas "
"of type {}, expected DensePoseDataRelative".format(type(densepose_data))
)
densepose_data_ondevice = (
densepose_data.to(device) if densepose_data is not None else None
)
self.densepose_datas.append(densepose_data_ondevice)
self.boxes_xyxy_abs = boxes_xyxy_abs.to(device)
self.image_size_hw = image_size_hw
self.device = device
def to(self, device):
if self.device == device:
return self
return DensePoseList(self.densepose_datas, self.boxes_xyxy_abs, self.image_size_hw, device)
def __iter__(self):
return iter(self.densepose_datas)
def __len__(self):
return len(self.densepose_datas)
def __repr__(self):
s = self.__class__.__name__ + "("
s += "num_instances={}, ".format(len(self.densepose_datas))
s += "image_width={}, ".format(self.image_size_hw[1])
s += "image_height={})".format(self.image_size_hw[0])
return s
def __getitem__(self, item):
if isinstance(item, int):
densepose_data_rel = self.densepose_datas[item]
return densepose_data_rel
elif isinstance(item, slice):
densepose_datas_rel = self.densepose_datas[item]
boxes_xyxy_abs = self.boxes_xyxy_abs[item]
return DensePoseList(
densepose_datas_rel, boxes_xyxy_abs, self.image_size_hw, self.device
)
elif isinstance(item, torch.Tensor) and (item.dtype == torch.bool):
densepose_datas_rel = [self.densepose_datas[i] for i, x in enumerate(item) if x > 0]
boxes_xyxy_abs = self.boxes_xyxy_abs[item]
return DensePoseList(
densepose_datas_rel, boxes_xyxy_abs, self.image_size_hw, self.device
)
else:
densepose_datas_rel = [self.densepose_datas[i] for i in item]
boxes_xyxy_abs = self.boxes_xyxy_abs[item]
return DensePoseList(
densepose_datas_rel, boxes_xyxy_abs, self.image_size_hw, self.device
)
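# Indexing sketch: integer indices return the per-instance DensePoseDataRelative
# (or None), while slices, index lists and boolean masks return a new DensePoseList:
#   dp_list[0]                                   # DensePoseDataRelative or None
#   dp_list[1:3]                                 # DensePoseList with two entries
#   dp_list[torch.tensor([True, False, True])]   # boolean-mask selection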

View File

@@ -0,0 +1,3 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
from .image import ImageResizeTransform

View File

@@ -0,0 +1,37 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import torch
class ImageResizeTransform:
"""
Transform that converts frames loaded from a dataset
(RGB data in NHWC channel order, typically uint8) to a format ready to be
consumed by DensePose training (BGR float32 data in NCHW channel order)
"""
def __init__(self, min_size: int = 800, max_size: int = 1333):
self.min_size = min_size
self.max_size = max_size
def __call__(self, frames: torch.Tensor) -> torch.Tensor:
"""
Args:
frames (torch.Tensor): tensor of size [N, H, W, 3] that contains
RGB data (typically in uint8)
Returns:
frames (torch.Tensor): tensor of size [N, 3, H1, W1] where
H1 and W1 are chosen to respect the specified min and max sizes
                and preserve the original aspect ratio; the data channels
                follow BGR order and the data type is `torch.float32`
"""
frames = frames[..., [2, 1, 0]] # RGB -> BGR
frames = frames.permute(0, 3, 1, 2).float() # NHWC -> NCHW
# resize with min size
min_size = min(frames.shape[-2:])
max_size = max(frames.shape[-2:])
scale = min(self.min_size / min_size, self.max_size / max_size)
frames = torch.nn.functional.interpolate(
frames, scale_factor=scale, mode="bilinear", align_corners=False
)
return frames
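# Usage sketch (frame sizes are illustrative): resize a batch of RGB uint8 frames
# into the format expected by the model.
#   transform = ImageResizeTransform(min_size=800, max_size=1333)
#   frames = torch.randint(0, 256, (4, 720, 1280, 3), dtype=torch.uint8)
#   batch = transform(frames)   # [4, 3, H1, W1] float32, BGR channel order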

View File

@@ -0,0 +1,22 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import os
from typing import Optional
def is_relative_local_path(path: os.PathLike):
path_str = os.fsdecode(path)
return ("://" not in path_str) and not os.path.isabs(path)
def maybe_prepend_base_path(base_path: Optional[os.PathLike], path: os.PathLike):
"""
Prepends the provided path with a base path prefix if:
1) base path is not None;
    2) path is a relative local path
"""
if base_path is None:
return path
if is_relative_local_path(path):
return os.path.join(base_path, path)
return path
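# Behaviour sketch (POSIX-style paths, purely illustrative):
#   maybe_prepend_base_path("/data/videos", "clip01.mkv")     # -> "/data/videos/clip01.mkv"
#   maybe_prepend_base_path("/data/videos", "/abs/clip.mkv")  # -> "/abs/clip.mkv" (absolute, kept)
#   maybe_prepend_base_path("/data", "s3://bucket/clip.mkv")  # -> "s3://bucket/clip.mkv" (URI, kept)
#   maybe_prepend_base_path(None, "clip01.mkv")               # -> "clip01.mkv"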

View File

@@ -0,0 +1,17 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
from .frame_selector import (
FrameSelectionStrategy,
RandomKFramesSelector,
FirstKFramesSelector,
LastKFramesSelector,
FrameTsList,
FrameSelector,
)
from .video_keyframe_dataset import (
VideoKeyframeDataset,
video_list_from_file,
list_keyframes,
read_keyframes,
)

View File

@@ -0,0 +1,87 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import random
from collections.abc import Callable
from enum import Enum
from typing import Callable as TCallable
from typing import List
FrameTsList = List[int]
FrameSelector = TCallable[[FrameTsList], FrameTsList]
class FrameSelectionStrategy(Enum):
"""
Frame selection strategy used with videos:
- "random_k": select k random frames
- "first_k": select k first frames
- "last_k": select k last frames
- "all": select all frames
"""
# fmt: off
RANDOM_K = "random_k"
FIRST_K = "first_k"
LAST_K = "last_k"
ALL = "all"
# fmt: on
class RandomKFramesSelector(Callable):
"""
Selector that retains at most `k` random frames
"""
def __init__(self, k: int):
self.k = k
def __call__(self, frame_tss: FrameTsList) -> FrameTsList:
"""
Select `k` random frames
Args:
            frame_tss (List[int]): timestamps of input frames
Returns:
List[int]: timestamps of selected frames
"""
return random.sample(frame_tss, min(self.k, len(frame_tss)))
class FirstKFramesSelector(Callable):
"""
Selector that retains at most `k` first frames
"""
def __init__(self, k: int):
self.k = k
def __call__(self, frame_tss: FrameTsList) -> FrameTsList:
"""
Select `k` first frames
Args:
            frame_tss (List[int]): timestamps of input frames
Returns:
List[int]: timestamps of selected frames
"""
return frame_tss[: self.k]
class LastKFramesSelector(Callable):
"""
Selector that retains at most `k` last frames from video data
"""
def __init__(self, k: int):
self.k = k
def __call__(self, frame_tss: FrameTsList) -> FrameTsList:
"""
Select `k` last frames
Args:
            frame_tss (List[int]): timestamps of input frames
Returns:
List[int]: timestamps of selected frames
"""
return frame_tss[-self.k :]
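# Selection sketch, given keyframe timestamps [0, 10, 20, 30]:
#   FirstKFramesSelector(2)([0, 10, 20, 30])    # -> [0, 10]
#   LastKFramesSelector(2)([0, 10, 20, 30])     # -> [20, 30]
#   RandomKFramesSelector(2)([0, 10, 20, 30])   # -> 2 timestamps sampled without replacement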

View File

@@ -0,0 +1,232 @@
# -*- coding: utf-8 -*-
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import logging
import numpy as np
from typing import Callable, List, Optional
import torch
from fvcore.common.file_io import PathManager
from torch.utils.data.dataset import Dataset
import av
from ..utils import maybe_prepend_base_path
from .frame_selector import FrameSelector, FrameTsList
FrameList = List[av.frame.Frame]
FrameTransform = Callable[[torch.Tensor], torch.Tensor]
def list_keyframes(video_fpath: str, video_stream_idx: int = 0) -> FrameTsList:
"""
Traverses all keyframes of a video file. Returns a list of keyframe
timestamps. Timestamps are counts in timebase units.
Args:
video_fpath (str): Video file path
video_stream_idx (int): Video stream index (default: 0)
Returns:
        List[int]: list of keyframe timestamps (timestamp is a count in timebase
units)
"""
try:
with PathManager.open(video_fpath, "rb") as io:
container = av.open(io, mode="r")
stream = container.streams.video[video_stream_idx]
keyframes = []
pts = -1
# Note: even though we request forward seeks for keyframes, sometimes
# a keyframe in backwards direction is returned. We introduce tolerance
# as a max count of ignored backward seeks
tolerance_backward_seeks = 2
while True:
try:
container.seek(pts + 1, backward=False, any_frame=False, stream=stream)
except av.AVError as e:
# the exception occurs when the video length is exceeded,
# we then return whatever data we've already collected
logger = logging.getLogger(__name__)
logger.debug(
f"List keyframes: Error seeking video file {video_fpath}, "
f"video stream {video_stream_idx}, pts {pts + 1}, AV error: {e}"
)
return keyframes
except OSError as e:
logger = logging.getLogger(__name__)
logger.warning(
f"List keyframes: Error seeking video file {video_fpath}, "
f"video stream {video_stream_idx}, pts {pts + 1}, OS error: {e}"
)
return []
packet = next(container.demux(video=video_stream_idx))
if packet.pts is not None and packet.pts <= pts:
logger = logging.getLogger(__name__)
logger.warning(
f"Video file {video_fpath}, stream {video_stream_idx}: "
f"bad seek for packet {pts + 1} (got packet {packet.pts}), "
f"tolerance {tolerance_backward_seeks}."
)
tolerance_backward_seeks -= 1
if tolerance_backward_seeks == 0:
return []
pts += 1
continue
tolerance_backward_seeks = 2
pts = packet.pts
if pts is None:
return keyframes
if packet.is_keyframe:
keyframes.append(pts)
return keyframes
except OSError as e:
logger = logging.getLogger(__name__)
logger.warning(
f"List keyframes: Error opening video file container {video_fpath}, " f"OS error: {e}"
)
except RuntimeError as e:
logger = logging.getLogger(__name__)
logger.warning(
f"List keyframes: Error opening video file container {video_fpath}, "
f"Runtime error: {e}"
)
return []
def read_keyframes(
video_fpath: str, keyframes: FrameTsList, video_stream_idx: int = 0
) -> FrameList:
"""
Reads keyframe data from a video file.
Args:
video_fpath (str): Video file path
keyframes (List[int]): List of keyframe timestamps (as counts in
timebase units to be used in container seek operations)
video_stream_idx (int): Video stream index (default: 0)
Returns:
List[Frame]: list of frames that correspond to the specified timestamps
"""
try:
with PathManager.open(video_fpath, "rb") as io:
container = av.open(io)
stream = container.streams.video[video_stream_idx]
frames = []
for pts in keyframes:
try:
container.seek(pts, any_frame=False, stream=stream)
frame = next(container.decode(video=0))
frames.append(frame)
except av.AVError as e:
logger = logging.getLogger(__name__)
logger.warning(
f"Read keyframes: Error seeking video file {video_fpath}, "
f"video stream {video_stream_idx}, pts {pts}, AV error: {e}"
)
container.close()
return frames
except OSError as e:
logger = logging.getLogger(__name__)
logger.warning(
f"Read keyframes: Error seeking video file {video_fpath}, "
f"video stream {video_stream_idx}, pts {pts}, OS error: {e}"
)
container.close()
return frames
except StopIteration:
logger = logging.getLogger(__name__)
logger.warning(
f"Read keyframes: Error decoding frame from {video_fpath}, "
f"video stream {video_stream_idx}, pts {pts}"
)
container.close()
return frames
container.close()
return frames
except OSError as e:
logger = logging.getLogger(__name__)
logger.warning(
f"Read keyframes: Error opening video file container {video_fpath}, OS error: {e}"
)
except RuntimeError as e:
logger = logging.getLogger(__name__)
logger.warning(
f"Read keyframes: Error opening video file container {video_fpath}, Runtime error: {e}"
)
return []
def video_list_from_file(video_list_fpath: str, base_path: Optional[str] = None):
"""
Create a list of paths to video files from a text file.
Args:
video_list_fpath (str): path to a plain text file with the list of videos
base_path (str): base path for entries from the video list (default: None)
"""
video_list = []
with PathManager.open(video_list_fpath, "r") as io:
for line in io:
video_list.append(maybe_prepend_base_path(base_path, line.strip()))
return video_list
class VideoKeyframeDataset(Dataset):
"""
Dataset that provides keyframes for a set of videos.
"""
_EMPTY_FRAMES = torch.empty((0, 3, 1, 1))
def __init__(
self,
video_list: List[str],
frame_selector: Optional[FrameSelector] = None,
transform: Optional[FrameTransform] = None,
):
"""
Dataset constructor
Args:
video_list (List[str]): list of paths to video files
frame_selector (Callable: KeyFrameList -> KeyFrameList):
selects keyframes to process, keyframes are given by
packet timestamps in timebase counts. If None, all keyframes
are selected (default: None)
transform (Callable: torch.Tensor -> torch.Tensor):
transforms a batch of RGB images (tensors of size [B, H, W, 3]),
returns a tensor of the same size. If None, no transform is
applied (default: None)
"""
self.video_list = video_list
self.frame_selector = frame_selector
self.transform = transform
def __getitem__(self, idx: int) -> torch.Tensor:
"""
Gets selected keyframes from a given video
Args:
idx (int): video index in the video list file
Returns:
frames (torch.Tensor): tensor of size [N, H, W, 3] or of size
defined by the transform that contains keyframes data
"""
fpath = self.video_list[idx]
keyframes = list_keyframes(fpath)
if not keyframes:
return self._EMPTY_FRAMES
if self.frame_selector is not None:
keyframes = self.frame_selector(keyframes)
frames = read_keyframes(fpath, keyframes)
if not frames:
return self._EMPTY_FRAMES
frames = np.stack([frame.to_rgb().to_ndarray() for frame in frames])
frames = torch.as_tensor(frames, device=torch.device("cpu"))
if self.transform is not None:
frames = self.transform(frames)
return frames
def __len__(self):
return len(self.video_list)
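# End-to-end sketch (file names and imports are hypothetical): build a keyframe dataset
# that keeps at most 4 random keyframes per video and resizes them for the model,
# assuming RandomKFramesSelector and ImageResizeTransform are imported from the
# corresponding modules above.
#   video_list = video_list_from_file("videos.txt", base_path="/datasets/clips")
#   dataset = VideoKeyframeDataset(
#       video_list,
#       frame_selector=RandomKFramesSelector(4),
#       transform=ImageResizeTransform(),
#   )
#   frames = dataset[0]   # [N, 3, H1, W1] float32, or an empty tensor if decoding failed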

File diff suppressed because it is too large

View File

@@ -0,0 +1,3 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
from .trainer import Trainer

View File

@@ -0,0 +1,118 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import logging
import os
from collections import OrderedDict
from detectron2.checkpoint import DetectionCheckpointer
from detectron2.config import CfgNode
from detectron2.engine import DefaultTrainer
from detectron2.evaluation import COCOEvaluator, DatasetEvaluators
from detectron2.utils.events import EventWriter, get_event_storage
from densepose import (
DensePoseCOCOEvaluator,
DensePoseDatasetMapperTTA,
DensePoseGeneralizedRCNNWithTTA,
load_from_cfg,
)
from densepose.data import (
DatasetMapper,
build_combined_loader,
build_detection_test_loader,
build_detection_train_loader,
build_inference_based_loaders,
has_inference_based_loaders,
)
class SampleCountingLoader:
def __init__(self, loader):
self.loader = loader
def __iter__(self):
it = iter(self.loader)
storage = get_event_storage()
while True:
try:
batch = next(it)
num_inst_per_dataset = {}
for data in batch:
dataset_name = data["dataset"]
if dataset_name not in num_inst_per_dataset:
num_inst_per_dataset[dataset_name] = 0
num_inst = len(data["instances"])
num_inst_per_dataset[dataset_name] += num_inst
for dataset_name in num_inst_per_dataset:
storage.put_scalar(f"batch/{dataset_name}", num_inst_per_dataset[dataset_name])
yield batch
except StopIteration:
break
class SampleCountMetricPrinter(EventWriter):
def __init__(self):
self.logger = logging.getLogger(__name__)
def write(self):
storage = get_event_storage()
batch_stats_strs = []
for key, buf in storage.histories().items():
if key.startswith("batch/"):
batch_stats_strs.append(f"{key} {buf.avg(20)}")
self.logger.info(", ".join(batch_stats_strs))
class Trainer(DefaultTrainer):
@classmethod
def build_evaluator(cls, cfg: CfgNode, dataset_name, output_folder=None):
if output_folder is None:
output_folder = os.path.join(cfg.OUTPUT_DIR, "inference")
evaluators = [COCOEvaluator(dataset_name, cfg, True, output_folder)]
if cfg.MODEL.DENSEPOSE_ON:
evaluators.append(DensePoseCOCOEvaluator(dataset_name, True, output_folder))
return DatasetEvaluators(evaluators)
@classmethod
def build_test_loader(cls, cfg: CfgNode, dataset_name):
return build_detection_test_loader(cfg, dataset_name, mapper=DatasetMapper(cfg, False))
@classmethod
def build_train_loader(cls, cfg: CfgNode):
data_loader = build_detection_train_loader(cfg, mapper=DatasetMapper(cfg, True))
if not has_inference_based_loaders(cfg):
return data_loader
model = cls.build_model(cfg)
model.to(cfg.BOOTSTRAP_MODEL.DEVICE)
DetectionCheckpointer(model).resume_or_load(cfg.BOOTSTRAP_MODEL.WEIGHTS, resume=False)
inference_based_loaders, ratios = build_inference_based_loaders(cfg, model)
loaders = [data_loader] + inference_based_loaders
ratios = [1.0] + ratios
combined_data_loader = build_combined_loader(cfg, loaders, ratios)
sample_counting_loader = SampleCountingLoader(combined_data_loader)
return sample_counting_loader
def build_writers(self):
writers = super().build_writers()
writers.append(SampleCountMetricPrinter())
return writers
@classmethod
def test_with_TTA(cls, cfg: CfgNode, model):
logger = logging.getLogger("detectron2.trainer")
# In the end of training, run an evaluation with TTA
# Only support some R-CNN models.
logger.info("Running inference with test-time augmentation ...")
transform_data = load_from_cfg(cfg)
model = DensePoseGeneralizedRCNNWithTTA(
cfg, model, transform_data, DensePoseDatasetMapperTTA(cfg)
)
evaluators = [
cls.build_evaluator(
cfg, name, output_folder=os.path.join(cfg.OUTPUT_DIR, "inference_TTA")
)
for name in cfg.DATASETS.TEST
]
res = cls.test(cfg, model, evaluators)
res = OrderedDict({k + "_TTA": v for k, v in res.items()})
return res
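# Usage sketch (follows the standard detectron2 DefaultTrainer flow; cfg setup omitted):
#   trainer = Trainer(cfg)
#   trainer.resume_or_load(resume=False)
#   trainer.train()
#   # optionally, evaluate with test-time augmentation afterwards:
#   # Trainer.test_with_TTA(cfg, trainer.model)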

View File

@@ -0,0 +1,224 @@
# -*- coding: utf-8 -*-
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import contextlib
import copy
import io
import itertools
import logging
import numpy as np
import os
from collections import OrderedDict
import pycocotools.mask as mask_utils
import torch
from fvcore.common.file_io import PathManager
from pycocotools.coco import COCO
from detectron2.data import MetadataCatalog
from detectron2.evaluation import DatasetEvaluator
from detectron2.structures import BoxMode
from detectron2.utils.comm import all_gather, is_main_process, synchronize
from detectron2.utils.logger import create_small_table
from .data.samplers import densepose_to_mask
from .densepose_coco_evaluation import DensePoseCocoEval, DensePoseEvalMode
class DensePoseCOCOEvaluator(DatasetEvaluator):
def __init__(self, dataset_name, distributed, output_dir=None):
self._distributed = distributed
self._output_dir = output_dir
self._cpu_device = torch.device("cpu")
self._logger = logging.getLogger(__name__)
self._metadata = MetadataCatalog.get(dataset_name)
self._min_threshold = 0.5
json_file = PathManager.get_local_path(self._metadata.json_file)
with contextlib.redirect_stdout(io.StringIO()):
self._coco_api = COCO(json_file)
def reset(self):
self._predictions = []
def process(self, inputs, outputs):
"""
Args:
inputs: the inputs to a COCO model (e.g., GeneralizedRCNN).
It is a list of dict. Each dict corresponds to an image and
contains keys like "height", "width", "file_name", "image_id".
outputs: the outputs of a COCO model. It is a list of dicts with key
"instances" that contains :class:`Instances`.
The :class:`Instances` object needs to have `densepose` field.
"""
for input, output in zip(inputs, outputs):
instances = output["instances"].to(self._cpu_device)
json_results = prediction_to_json(instances, input["image_id"])
self._predictions.extend(json_results)
def evaluate(self):
if self._distributed:
synchronize()
predictions = all_gather(self._predictions)
predictions = list(itertools.chain(*predictions))
if not is_main_process():
return
else:
predictions = self._predictions
return copy.deepcopy(self._eval_predictions(predictions))
def _eval_predictions(self, predictions):
"""
Evaluate predictions on densepose.
Return results with the metrics of the tasks.
"""
self._logger.info("Preparing results for COCO format ...")
if self._output_dir:
PathManager.mkdirs(self._output_dir)
file_path = os.path.join(self._output_dir, "coco_densepose_predictions.pth")
with PathManager.open(file_path, "wb") as f:
torch.save(predictions, f)
self._logger.info("Evaluating predictions ...")
res = OrderedDict()
results_gps, results_gpsm, results_segm = _evaluate_predictions_on_coco(
self._coco_api, predictions, min_threshold=self._min_threshold
)
res["densepose_gps"] = results_gps
res["densepose_gpsm"] = results_gpsm
res["densepose_segm"] = results_segm
return res
def prediction_to_json(instances, img_id):
"""
Args:
instances (Instances): the output of the model
img_id (str): the image id in COCO
Returns:
list[dict]: the results in densepose evaluation format
"""
scores = instances.scores.tolist()
segmentations = densepose_to_mask(instances)
boxes = instances.pred_boxes.tensor.clone()
boxes = BoxMode.convert(boxes, BoxMode.XYXY_ABS, BoxMode.XYWH_ABS)
instances.pred_densepose = instances.pred_densepose.to_result(boxes)
results = []
for k in range(len(instances)):
densepose = instances.pred_densepose[k]
segmentation = segmentations.tensor[k]
segmentation_encoded = mask_utils.encode(
np.require(segmentation.numpy(), dtype=np.uint8, requirements=["F"])
)
segmentation_encoded["counts"] = segmentation_encoded["counts"].decode("utf-8")
result = {
"image_id": img_id,
"category_id": 1, # densepose only has one class
"bbox": densepose[1],
"score": scores[k],
"densepose": densepose,
"segmentation": segmentation_encoded,
}
results.append(result)
return results
def _evaluate_predictions_on_coco(coco_gt, coco_results, min_threshold=0.5):
logger = logging.getLogger(__name__)
segm_metrics = _get_segmentation_metrics()
densepose_metrics = _get_densepose_metrics(min_threshold)
if len(coco_results) == 0: # cocoapi does not handle empty results very well
logger.warn("No predictions from the model! Set scores to -1")
results_gps = {metric: -1 for metric in densepose_metrics}
results_gpsm = {metric: -1 for metric in densepose_metrics}
results_segm = {metric: -1 for metric in segm_metrics}
return results_gps, results_gpsm, results_segm
coco_dt = coco_gt.loadRes(coco_results)
results_segm = _evaluate_predictions_on_coco_segm(coco_gt, coco_dt, segm_metrics, min_threshold)
logger.info("Evaluation results for densepose segm: \n" + create_small_table(results_segm))
results_gps = _evaluate_predictions_on_coco_gps(
coco_gt, coco_dt, densepose_metrics, min_threshold
)
logger.info(
"Evaluation results for densepose, GPS metric: \n" + create_small_table(results_gps)
)
results_gpsm = _evaluate_predictions_on_coco_gpsm(
coco_gt, coco_dt, densepose_metrics, min_threshold
)
logger.info(
"Evaluation results for densepose, GPSm metric: \n" + create_small_table(results_gpsm)
)
return results_gps, results_gpsm, results_segm
def _get_densepose_metrics(min_threshold=0.5):
metrics = ["AP"]
if min_threshold <= 0.201:
metrics += ["AP20"]
if min_threshold <= 0.301:
metrics += ["AP30"]
if min_threshold <= 0.401:
metrics += ["AP40"]
metrics.extend(["AP50", "AP75", "APm", "APl", "AR", "AR50", "AR75", "ARm", "ARl"])
return metrics
def _get_segmentation_metrics():
return [
"AP",
"AP50",
"AP75",
"APs",
"APm",
"APl",
"AR@1",
"AR@10",
"AR@100",
"ARs",
"ARm",
"ARl",
]
def _evaluate_predictions_on_coco_gps(coco_gt, coco_dt, metrics, min_threshold=0.5):
coco_eval = DensePoseCocoEval(coco_gt, coco_dt, "densepose", dpEvalMode=DensePoseEvalMode.GPS)
coco_eval.params.iouThrs = np.linspace(
min_threshold, 0.95, int(np.round((0.95 - min_threshold) / 0.05)) + 1, endpoint=True
)
coco_eval.evaluate()
coco_eval.accumulate()
coco_eval.summarize()
results = {metric: float(coco_eval.stats[idx] * 100) for idx, metric in enumerate(metrics)}
return results
def _evaluate_predictions_on_coco_gpsm(coco_gt, coco_dt, metrics, min_threshold=0.5):
coco_eval = DensePoseCocoEval(coco_gt, coco_dt, "densepose", dpEvalMode=DensePoseEvalMode.GPSM)
coco_eval.params.iouThrs = np.linspace(
min_threshold, 0.95, int(np.round((0.95 - min_threshold) / 0.05)) + 1, endpoint=True
)
coco_eval.evaluate()
coco_eval.accumulate()
coco_eval.summarize()
results = {metric: float(coco_eval.stats[idx] * 100) for idx, metric in enumerate(metrics)}
return results
def _evaluate_predictions_on_coco_segm(coco_gt, coco_dt, metrics, min_threshold=0.5):
coco_eval = DensePoseCocoEval(coco_gt, coco_dt, "segm")
coco_eval.params.iouThrs = np.linspace(
min_threshold, 0.95, int(np.round((0.95 - min_threshold) / 0.05)) + 1, endpoint=True
)
coco_eval.evaluate()
coco_eval.accumulate()
coco_eval.summarize()
results = {metric: float(coco_eval.stats[idx] * 100) for idx, metric in enumerate(metrics)}
return results
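# IoU threshold sketch: with the default min_threshold=0.5 the evaluators above use
#   np.linspace(0.5, 0.95, int(np.round((0.95 - 0.5) / 0.05)) + 1)   # 10 thresholds
# i.e. 0.50, 0.55, ..., 0.95; lowering min_threshold to 0.2 extends the grid to
# 16 thresholds and additionally reports AP20 / AP30 / AP40 (see _get_densepose_metrics).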

View File

@@ -0,0 +1,66 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
from detectron2.config import CfgNode
from .filter import DensePoseDataFilter
from .losses import DensePoseLosses
from .predictors import DensePoseChartWithConfidencePredictor
def build_densepose_predictor(cfg: CfgNode, input_channels: int):
"""
Create an instance of DensePose predictor based on configuration options.
Args:
cfg (CfgNode): configuration options
input_channels (int): input tensor size along the channel dimension
Return:
An instance of DensePose predictor
"""
predictor = DensePoseChartWithConfidencePredictor(cfg, input_channels)
return predictor
def build_densepose_data_filter(cfg: CfgNode):
"""
Build DensePose data filter which selects data for training
Args:
cfg (CfgNode): configuration options
Return:
Callable: list(Tensor), list(Instances) -> list(Tensor), list(Instances)
An instance of DensePose filter, which takes feature tensors and proposals
as an input and returns filtered features and proposals
"""
dp_filter = DensePoseDataFilter(cfg)
return dp_filter
def build_densepose_head(cfg: CfgNode, input_channels: int):
"""
    Build DensePose head based on configuration options
Args:
cfg (CfgNode): configuration options
input_channels (int): input tensor size along the channel dimension
Return:
An instance of DensePose head
"""
from .roi_heads.registry import ROI_DENSEPOSE_HEAD_REGISTRY
head_name = cfg.MODEL.ROI_DENSEPOSE_HEAD.NAME
return ROI_DENSEPOSE_HEAD_REGISTRY.get(head_name)(cfg, input_channels)
def build_densepose_losses(cfg: CfgNode):
"""
    Build DensePose loss based on configuration options
Args:
cfg (CfgNode): configuration options
Return:
An instance of DensePose loss
"""
losses = DensePoseLosses(cfg)
return losses
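# Usage sketch: the builders above are typically invoked from the DensePose ROI heads
# setup, roughly as follows (cfg and the channel counts `in_channels` /
# `head_out_channels` are assumed to be provided by the surrounding model code):
#   dp_data_filter = build_densepose_data_filter(cfg)
#   dp_head = build_densepose_head(cfg, in_channels)
#   dp_predictor = build_densepose_predictor(cfg, head_out_channels)
#   dp_losses = build_densepose_losses(cfg)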

View File

@@ -0,0 +1,73 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
from dataclasses import dataclass
from enum import Enum
from detectron2.config import CfgNode
class DensePoseUVConfidenceType(Enum):
"""
Statistical model type for confidence learning, possible values:
- "iid_iso": statistically independent identically distributed residuals
        with isotropic covariance
- "indep_aniso": statistically independent residuals with anisotropic
covariances
For details, see:
N. Neverova, D. Novotny, A. Vedaldi "Correlated Uncertainty for Learning
Dense Correspondences from Noisy Labels", p. 918--926, in Proc. NIPS 2019
"""
# fmt: off
IID_ISO = "iid_iso"
INDEP_ANISO = "indep_aniso"
# fmt: on
@dataclass
class DensePoseUVConfidenceConfig:
"""
Configuration options for confidence on UV data
"""
enabled: bool = False
# lower bound on UV confidences
epsilon: float = 0.01
type: DensePoseUVConfidenceType = DensePoseUVConfidenceType.IID_ISO
@dataclass
class DensePoseSegmConfidenceConfig:
"""
Configuration options for confidence on segmentation
"""
enabled: bool = False
# lower bound on confidence values
epsilon: float = 0.01
@dataclass
class DensePoseConfidenceModelConfig:
"""
Configuration options for confidence models
"""
# confidence for U and V values
uv_confidence: DensePoseUVConfidenceConfig
# segmentation confidence
segm_confidence: DensePoseSegmConfidenceConfig
@staticmethod
def from_cfg(cfg: CfgNode) -> "DensePoseConfidenceModelConfig":
return DensePoseConfidenceModelConfig(
uv_confidence=DensePoseUVConfidenceConfig(
enabled=cfg.MODEL.ROI_DENSEPOSE_HEAD.UV_CONFIDENCE.ENABLED,
epsilon=cfg.MODEL.ROI_DENSEPOSE_HEAD.UV_CONFIDENCE.EPSILON,
type=DensePoseUVConfidenceType(cfg.MODEL.ROI_DENSEPOSE_HEAD.UV_CONFIDENCE.TYPE),
),
segm_confidence=DensePoseSegmConfidenceConfig(
enabled=cfg.MODEL.ROI_DENSEPOSE_HEAD.SEGM_CONFIDENCE.ENABLED,
epsilon=cfg.MODEL.ROI_DENSEPOSE_HEAD.SEGM_CONFIDENCE.EPSILON,
),
)
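# Usage sketch: derive the confidence configuration from a detectron2 config node
# (the MODEL.ROI_DENSEPOSE_HEAD.*_CONFIDENCE keys referenced in from_cfg must exist,
# e.g. after add_densepose_config has been applied to cfg):
#   confidence_cfg = DensePoseConfidenceModelConfig.from_cfg(cfg)
#   if confidence_cfg.uv_confidence.enabled:
#       uv_type = confidence_cfg.uv_confidence.type   # IID_ISO or INDEP_ANISO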

View File

@@ -0,0 +1,35 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
from collections import OrderedDict
from detectron2.checkpoint import DetectionCheckpointer
def _rename_HRNet_weights(weights):
# We detect and rename HRNet weights for DensePose. 1956 and 1716 are values that are
# common to all HRNet pretrained weights, and should be enough to accurately identify them
if (
len(weights["model"].keys()) == 1956
and len([k for k in weights["model"].keys() if k.startswith("stage")]) == 1716
):
hrnet_weights = OrderedDict()
for k in weights["model"].keys():
hrnet_weights["backbone.bottom_up." + str(k)] = weights["model"][k]
return {"model": hrnet_weights}
else:
return weights
class DensePoseCheckpointer(DetectionCheckpointer):
"""
Same as :class:`DetectionCheckpointer`, but is able to handle HRNet weights
"""
def __init__(self, model, save_dir="", *, save_to_disk=None, **checkpointables):
super().__init__(model, save_dir, save_to_disk=save_to_disk, **checkpointables)
def _load_file(self, filename: str) -> object:
"""
Adding hrnet support
"""
weights = super()._load_file(filename)
return _rename_HRNet_weights(weights)
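# Usage sketch: the checkpointer loads either regular detectron2 weights or raw HRNet
# weights (the latter are detected and renamed by _rename_HRNet_weights above):
#   checkpointer = DensePoseCheckpointer(model, save_dir=cfg.OUTPUT_DIR)
#   checkpointer.resume_or_load(cfg.MODEL.WEIGHTS, resume=False)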

View File

@@ -0,0 +1,94 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
from typing import List
import torch
from detectron2.config import CfgNode
from detectron2.structures import Instances
from detectron2.structures.boxes import matched_boxlist_iou
class DensePoseDataFilter(object):
def __init__(self, cfg: CfgNode):
self.iou_threshold = cfg.MODEL.ROI_DENSEPOSE_HEAD.FG_IOU_THRESHOLD
self.keep_masks = cfg.MODEL.ROI_DENSEPOSE_HEAD.COARSE_SEGM_TRAINED_BY_MASKS
@torch.no_grad()
def __call__(self, features: List[torch.Tensor], proposals_with_targets: List[Instances]):
"""
Filters proposals with targets to keep only the ones relevant for
DensePose training
Args:
features (list[Tensor]): input data as a list of features,
each feature is a tensor. Axis 0 represents the number of
images `N` in the input data; axes 1-3 are channels,
height, and width, which may vary between features
(e.g., if a feature pyramid is used).
proposals_with_targets (list[Instances]): length `N` list of
`Instances`. The i-th `Instances` contains instances
(proposals, GT) for the i-th input image,
Returns:
list[Tensor]: filtered features
list[Instances]: filtered proposals
"""
proposals_filtered = []
# TODO: the commented out code was supposed to correctly deal with situations
# where no valid DensePose GT is available for certain images. The corresponding
# image features were sliced and proposals were filtered. This led to performance
# deterioration, both in terms of runtime and in terms of evaluation results.
#
# feature_mask = torch.ones(
# len(proposals_with_targets),
# dtype=torch.bool,
# device=features[0].device if len(features) > 0 else torch.device("cpu"),
# )
for i, proposals_per_image in enumerate(proposals_with_targets):
if not proposals_per_image.has("gt_densepose") and (
not proposals_per_image.has("gt_masks") or not self.keep_masks
):
# feature_mask[i] = 0
continue
gt_boxes = proposals_per_image.gt_boxes
est_boxes = proposals_per_image.proposal_boxes
# apply match threshold for densepose head
iou = matched_boxlist_iou(gt_boxes, est_boxes)
iou_select = iou > self.iou_threshold
proposals_per_image = proposals_per_image[iou_select]
N_gt_boxes = len(proposals_per_image.gt_boxes)
assert N_gt_boxes == len(proposals_per_image.proposal_boxes), (
f"The number of GT boxes {N_gt_boxes} is different from the "
f"number of proposal boxes {len(proposals_per_image.proposal_boxes)}"
)
# filter out any target without suitable annotation
if self.keep_masks:
gt_masks = (
proposals_per_image.gt_masks
if hasattr(proposals_per_image, "gt_masks")
else [None] * N_gt_boxes
)
else:
gt_masks = [None] * N_gt_boxes
gt_densepose = (
proposals_per_image.gt_densepose
if hasattr(proposals_per_image, "gt_densepose")
else [None] * N_gt_boxes
)
assert len(gt_masks) == N_gt_boxes
assert len(gt_densepose) == N_gt_boxes
selected_indices = [
i
for i, (dp_target, mask_target) in enumerate(zip(gt_densepose, gt_masks))
if (dp_target is not None) or (mask_target is not None)
]
# if not len(selected_indices):
# feature_mask[i] = 0
# continue
if len(selected_indices) != N_gt_boxes:
proposals_per_image = proposals_per_image[selected_indices]
assert len(proposals_per_image.gt_boxes) == len(proposals_per_image.proposal_boxes)
proposals_filtered.append(proposals_per_image)
# features_filtered = [feature[feature_mask] for feature in features]
# return features_filtered, proposals_filtered
return features, proposals_filtered

View File

@@ -0,0 +1,181 @@
"""
MIT License
Copyright (c) 2019 Microsoft
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
"""
import torch
import torch.nn as nn
import torch.nn.functional as F
from detectron2.layers import ShapeSpec
from detectron2.modeling.backbone import BACKBONE_REGISTRY
from detectron2.modeling.backbone.backbone import Backbone
from .hrnet import build_pose_hrnet_backbone
class HRFPN(Backbone):
""" HRFPN (High Resolution Feature Pyramids)
Transforms outputs of HRNet backbone so they are suitable for the ROI_heads
arXiv: https://arxiv.org/abs/1904.04514
Adapted from https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/necks/hrfpn.py
Args:
bottom_up: (list) output of HRNet
in_features (list): names of the input features (output of HRNet)
in_channels (list): number of channels for each branch
out_channels (int): output channels of feature pyramids
n_out_features (int): number of output stages
pooling (str): pooling for generating feature pyramids (from {MAX, AVG})
share_conv (bool): Have one conv per output, or share one with all the outputs
"""
def __init__(
self,
bottom_up,
in_features,
n_out_features,
in_channels,
out_channels,
pooling="AVG",
share_conv=False,
):
super(HRFPN, self).__init__()
assert isinstance(in_channels, list)
self.bottom_up = bottom_up
self.in_features = in_features
self.n_out_features = n_out_features
self.in_channels = in_channels
self.out_channels = out_channels
self.num_ins = len(in_channels)
self.share_conv = share_conv
if self.share_conv:
self.fpn_conv = nn.Conv2d(
in_channels=out_channels, out_channels=out_channels, kernel_size=3, padding=1
)
else:
self.fpn_conv = nn.ModuleList()
for _ in range(self.n_out_features):
self.fpn_conv.append(
nn.Conv2d(
in_channels=out_channels,
out_channels=out_channels,
kernel_size=3,
padding=1,
)
)
# Custom change: Replaces a simple bilinear interpolation
self.interp_conv = nn.ModuleList()
for i in range(len(self.in_features)):
self.interp_conv.append(
nn.Sequential(
nn.ConvTranspose2d(
in_channels=in_channels[i],
out_channels=in_channels[i],
kernel_size=4,
stride=2 ** i,
padding=0,
output_padding=0,
bias=False,
),
nn.BatchNorm2d(in_channels[i], momentum=0.1),
nn.ReLU(inplace=True),
)
)
# Custom change: Replaces a couple (reduction conv + pooling) by one conv
self.reduction_pooling_conv = nn.ModuleList()
for i in range(self.n_out_features):
self.reduction_pooling_conv.append(
nn.Sequential(
nn.Conv2d(sum(in_channels), out_channels, kernel_size=2 ** i, stride=2 ** i),
nn.BatchNorm2d(out_channels, momentum=0.1),
nn.ReLU(inplace=True),
)
)
if pooling == "MAX":
self.pooling = F.max_pool2d
else:
self.pooling = F.avg_pool2d
self._out_features = []
self._out_feature_channels = {}
self._out_feature_strides = {}
for i in range(self.n_out_features):
self._out_features.append("p%d" % (i + 1))
self._out_feature_channels.update({self._out_features[-1]: self.out_channels})
self._out_feature_strides.update({self._out_features[-1]: 2 ** (i + 2)})
# default init_weights for conv(msra) and norm in ConvModule
def init_weights(self):
for m in self.modules():
if isinstance(m, nn.Conv2d):
nn.init.kaiming_normal_(m.weight, a=1)
nn.init.constant_(m.bias, 0)
def forward(self, inputs):
bottom_up_features = self.bottom_up(inputs)
assert len(bottom_up_features) == len(self.in_features)
inputs = [bottom_up_features[f] for f in self.in_features]
outs = []
for i in range(len(inputs)):
outs.append(self.interp_conv[i](inputs[i]))
shape_2 = min(o.shape[2] for o in outs)
shape_3 = min(o.shape[3] for o in outs)
out = torch.cat([o[:, :, :shape_2, :shape_3] for o in outs], dim=1)
outs = []
for i in range(self.n_out_features):
outs.append(self.reduction_pooling_conv[i](out))
for i in range(len(outs)): # Make shapes consistent
outs[-1 - i] = outs[-1 - i][
:, :, : outs[-1].shape[2] * 2 ** i, : outs[-1].shape[3] * 2 ** i
]
outputs = []
for i in range(len(outs)):
if self.share_conv:
outputs.append(self.fpn_conv(outs[i]))
else:
outputs.append(self.fpn_conv[i](outs[i]))
assert len(self._out_features) == len(outputs)
return dict(zip(self._out_features, outputs))
@BACKBONE_REGISTRY.register()
def build_hrfpn_backbone(cfg, input_shape: ShapeSpec):
in_channels = cfg.MODEL.HRNET.STAGE4.NUM_CHANNELS
in_features = ["p%d" % (i + 1) for i in range(cfg.MODEL.HRNET.STAGE4.NUM_BRANCHES)]
n_out_features = len(cfg.MODEL.ROI_HEADS.IN_FEATURES)
out_channels = cfg.MODEL.HRNET.HRFPN.OUT_CHANNELS
hrnet = build_pose_hrnet_backbone(cfg, input_shape)
hrfpn = HRFPN(
hrnet,
in_features,
n_out_features,
in_channels,
out_channels,
pooling="AVG",
share_conv=False,
)
return hrfpn
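# Usage sketch: the backbone is normally selected through the config, e.g. by setting
# MODEL.BACKBONE.NAME to "build_hrfpn_backbone"; building it explicitly would look like
# (images_nchw is a hypothetical NCHW float tensor of input images):
#   from detectron2.modeling import build_backbone
#   backbone = build_backbone(cfg)       # dispatches here via BACKBONE_REGISTRY
#   features = backbone(images_nchw)     # dict of feature maps keyed by "p1", "p2", ...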

View File

@@ -0,0 +1,473 @@
# ------------------------------------------------------------------------------
# Copyright (c) Microsoft
# Licensed under the MIT License.
# Written by Bin Xiao (leoxiaobin@gmail.com)
# Modified by Bowen Cheng (bcheng9@illinois.edu)
# Adapted from https://github.com/HRNet/Higher-HRNet-Human-Pose-Estimation/blob/master/lib/models/pose_higher_hrnet.py # noqa
# ------------------------------------------------------------------------------
from __future__ import absolute_import, division, print_function
import logging
import torch.nn as nn
from detectron2.layers import ShapeSpec
from detectron2.modeling.backbone import BACKBONE_REGISTRY
from detectron2.modeling.backbone.backbone import Backbone
BN_MOMENTUM = 0.1
logger = logging.getLogger(__name__)
__all__ = ["build_pose_hrnet_backbone", "PoseHigherResolutionNet"]
def conv3x3(in_planes, out_planes, stride=1):
"""3x3 convolution with padding"""
return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, padding=1, bias=False)
class BasicBlock(nn.Module):
expansion = 1
def __init__(self, inplanes, planes, stride=1, downsample=None):
super(BasicBlock, self).__init__()
self.conv1 = conv3x3(inplanes, planes, stride)
self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
self.relu = nn.ReLU(inplace=True)
self.conv2 = conv3x3(planes, planes)
self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
self.downsample = downsample
self.stride = stride
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
if self.downsample is not None:
residual = self.downsample(x)
out += residual
out = self.relu(out)
return out
class Bottleneck(nn.Module):
expansion = 4
def __init__(self, inplanes, planes, stride=1, downsample=None):
super(Bottleneck, self).__init__()
self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
self.conv3 = nn.Conv2d(planes, planes * self.expansion, kernel_size=1, bias=False)
self.bn3 = nn.BatchNorm2d(planes * self.expansion, momentum=BN_MOMENTUM)
self.relu = nn.ReLU(inplace=True)
self.downsample = downsample
self.stride = stride
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
out = self.relu(out)
out = self.conv3(out)
out = self.bn3(out)
if self.downsample is not None:
residual = self.downsample(x)
out += residual
out = self.relu(out)
return out
class HighResolutionModule(nn.Module):
""" HighResolutionModule
Building block of the PoseHigherResolutionNet (see lower)
arXiv: https://arxiv.org/abs/1908.10357
Args:
        num_branches (int): number of branches of the module
blocks (str): type of block of the module
num_blocks (int): number of blocks of the module
num_inchannels (int): number of input channels of the module
num_channels (list): number of channels of each branch
multi_scale_output (bool): only used by the last module of PoseHigherResolutionNet
"""
def __init__(
self,
num_branches,
blocks,
num_blocks,
num_inchannels,
num_channels,
multi_scale_output=True,
):
super(HighResolutionModule, self).__init__()
self._check_branches(num_branches, blocks, num_blocks, num_inchannels, num_channels)
self.num_inchannels = num_inchannels
self.num_branches = num_branches
self.multi_scale_output = multi_scale_output
self.branches = self._make_branches(num_branches, blocks, num_blocks, num_channels)
self.fuse_layers = self._make_fuse_layers()
self.relu = nn.ReLU(True)
def _check_branches(self, num_branches, blocks, num_blocks, num_inchannels, num_channels):
if num_branches != len(num_blocks):
error_msg = "NUM_BRANCHES({}) <> NUM_BLOCKS({})".format(num_branches, len(num_blocks))
logger.error(error_msg)
raise ValueError(error_msg)
if num_branches != len(num_channels):
error_msg = "NUM_BRANCHES({}) <> NUM_CHANNELS({})".format(
num_branches, len(num_channels)
)
logger.error(error_msg)
raise ValueError(error_msg)
if num_branches != len(num_inchannels):
error_msg = "NUM_BRANCHES({}) <> NUM_INCHANNELS({})".format(
num_branches, len(num_inchannels)
)
logger.error(error_msg)
raise ValueError(error_msg)
def _make_one_branch(self, branch_index, block, num_blocks, num_channels, stride=1):
downsample = None
if (
stride != 1
or self.num_inchannels[branch_index] != num_channels[branch_index] * block.expansion
):
downsample = nn.Sequential(
nn.Conv2d(
self.num_inchannels[branch_index],
num_channels[branch_index] * block.expansion,
kernel_size=1,
stride=stride,
bias=False,
),
nn.BatchNorm2d(num_channels[branch_index] * block.expansion, momentum=BN_MOMENTUM),
)
layers = []
layers.append(
block(self.num_inchannels[branch_index], num_channels[branch_index], stride, downsample)
)
self.num_inchannels[branch_index] = num_channels[branch_index] * block.expansion
for _ in range(1, num_blocks[branch_index]):
layers.append(block(self.num_inchannels[branch_index], num_channels[branch_index]))
return nn.Sequential(*layers)
def _make_branches(self, num_branches, block, num_blocks, num_channels):
branches = []
for i in range(num_branches):
branches.append(self._make_one_branch(i, block, num_blocks, num_channels))
return nn.ModuleList(branches)
def _make_fuse_layers(self):
if self.num_branches == 1:
return None
num_branches = self.num_branches
num_inchannels = self.num_inchannels
fuse_layers = []
for i in range(num_branches if self.multi_scale_output else 1):
fuse_layer = []
for j in range(num_branches):
if j > i:
fuse_layer.append(
nn.Sequential(
nn.Conv2d(num_inchannels[j], num_inchannels[i], 1, 1, 0, bias=False),
nn.BatchNorm2d(num_inchannels[i]),
nn.Upsample(scale_factor=2 ** (j - i), mode="nearest"),
)
)
elif j == i:
fuse_layer.append(None)
else:
conv3x3s = []
for k in range(i - j):
if k == i - j - 1:
num_outchannels_conv3x3 = num_inchannels[i]
conv3x3s.append(
nn.Sequential(
nn.Conv2d(
num_inchannels[j],
num_outchannels_conv3x3,
3,
2,
1,
bias=False,
),
nn.BatchNorm2d(num_outchannels_conv3x3),
)
)
else:
num_outchannels_conv3x3 = num_inchannels[j]
conv3x3s.append(
nn.Sequential(
nn.Conv2d(
num_inchannels[j],
num_outchannels_conv3x3,
3,
2,
1,
bias=False,
),
nn.BatchNorm2d(num_outchannels_conv3x3),
nn.ReLU(True),
)
)
fuse_layer.append(nn.Sequential(*conv3x3s))
fuse_layers.append(nn.ModuleList(fuse_layer))
return nn.ModuleList(fuse_layers)
def get_num_inchannels(self):
return self.num_inchannels
def forward(self, x):
if self.num_branches == 1:
return [self.branches[0](x[0])]
for i in range(self.num_branches):
x[i] = self.branches[i](x[i])
x_fuse = []
for i in range(len(self.fuse_layers)):
y = x[0] if i == 0 else self.fuse_layers[i][0](x[0])
for j in range(1, self.num_branches):
if i == j:
y = y + x[j]
else:
z = self.fuse_layers[i][j](x[j])[:, :, : y.shape[2], : y.shape[3]]
y = y + z
x_fuse.append(self.relu(y))
return x_fuse
blocks_dict = {"BASIC": BasicBlock, "BOTTLENECK": Bottleneck}
class PoseHigherResolutionNet(Backbone):
""" PoseHigherResolutionNet
Composed of several HighResolutionModule tied together with ConvNets
Adapted from the GitHub version to fit with HRFPN and the Detectron2 infrastructure
arXiv: https://arxiv.org/abs/1908.10357
"""
def __init__(self, cfg, **kwargs):
self.inplanes = cfg.MODEL.HRNET.STEM_INPLANES
super(PoseHigherResolutionNet, self).__init__()
# stem net
self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=2, padding=1, bias=False)
self.bn1 = nn.BatchNorm2d(64, momentum=BN_MOMENTUM)
self.conv2 = nn.Conv2d(64, 64, kernel_size=3, stride=2, padding=1, bias=False)
self.bn2 = nn.BatchNorm2d(64, momentum=BN_MOMENTUM)
self.relu = nn.ReLU(inplace=True)
self.layer1 = self._make_layer(Bottleneck, 64, 4)
self.stage2_cfg = cfg.MODEL.HRNET.STAGE2
num_channels = self.stage2_cfg.NUM_CHANNELS
block = blocks_dict[self.stage2_cfg.BLOCK]
num_channels = [num_channels[i] * block.expansion for i in range(len(num_channels))]
self.transition1 = self._make_transition_layer([256], num_channels)
self.stage2, pre_stage_channels = self._make_stage(self.stage2_cfg, num_channels)
self.stage3_cfg = cfg.MODEL.HRNET.STAGE3
num_channels = self.stage3_cfg.NUM_CHANNELS
block = blocks_dict[self.stage3_cfg.BLOCK]
num_channels = [num_channels[i] * block.expansion for i in range(len(num_channels))]
self.transition2 = self._make_transition_layer(pre_stage_channels, num_channels)
self.stage3, pre_stage_channels = self._make_stage(self.stage3_cfg, num_channels)
self.stage4_cfg = cfg.MODEL.HRNET.STAGE4
num_channels = self.stage4_cfg.NUM_CHANNELS
block = blocks_dict[self.stage4_cfg.BLOCK]
num_channels = [num_channels[i] * block.expansion for i in range(len(num_channels))]
self.transition3 = self._make_transition_layer(pre_stage_channels, num_channels)
self.stage4, pre_stage_channels = self._make_stage(
self.stage4_cfg, num_channels, multi_scale_output=True
)
self._out_features = []
self._out_feature_channels = {}
self._out_feature_strides = {}
for i in range(cfg.MODEL.HRNET.STAGE4.NUM_BRANCHES):
self._out_features.append("p%d" % (i + 1))
self._out_feature_channels.update(
{self._out_features[-1]: cfg.MODEL.HRNET.STAGE4.NUM_CHANNELS[i]}
)
self._out_feature_strides.update({self._out_features[-1]: 1})
def _get_deconv_cfg(self, deconv_kernel):
if deconv_kernel == 4:
padding = 1
output_padding = 0
elif deconv_kernel == 3:
padding = 1
output_padding = 1
        elif deconv_kernel == 2:
            padding = 0
            output_padding = 0
        else:
            raise ValueError("Unsupported deconv kernel size: {}".format(deconv_kernel))
        return deconv_kernel, padding, output_padding
def _make_transition_layer(self, num_channels_pre_layer, num_channels_cur_layer):
num_branches_cur = len(num_channels_cur_layer)
num_branches_pre = len(num_channels_pre_layer)
transition_layers = []
for i in range(num_branches_cur):
if i < num_branches_pre:
if num_channels_cur_layer[i] != num_channels_pre_layer[i]:
transition_layers.append(
nn.Sequential(
nn.Conv2d(
num_channels_pre_layer[i],
num_channels_cur_layer[i],
3,
1,
1,
bias=False,
),
nn.BatchNorm2d(num_channels_cur_layer[i]),
nn.ReLU(inplace=True),
)
)
else:
transition_layers.append(None)
else:
conv3x3s = []
for j in range(i + 1 - num_branches_pre):
inchannels = num_channels_pre_layer[-1]
outchannels = (
num_channels_cur_layer[i] if j == i - num_branches_pre else inchannels
)
conv3x3s.append(
nn.Sequential(
nn.Conv2d(inchannels, outchannels, 3, 2, 1, bias=False),
nn.BatchNorm2d(outchannels),
nn.ReLU(inplace=True),
)
)
transition_layers.append(nn.Sequential(*conv3x3s))
return nn.ModuleList(transition_layers)
def _make_layer(self, block, planes, blocks, stride=1):
downsample = None
if stride != 1 or self.inplanes != planes * block.expansion:
downsample = nn.Sequential(
nn.Conv2d(
self.inplanes,
planes * block.expansion,
kernel_size=1,
stride=stride,
bias=False,
),
nn.BatchNorm2d(planes * block.expansion, momentum=BN_MOMENTUM),
)
layers = []
layers.append(block(self.inplanes, planes, stride, downsample))
self.inplanes = planes * block.expansion
for _ in range(1, blocks):
layers.append(block(self.inplanes, planes))
return nn.Sequential(*layers)
def _make_stage(self, layer_config, num_inchannels, multi_scale_output=True):
num_modules = layer_config["NUM_MODULES"]
num_branches = layer_config["NUM_BRANCHES"]
num_blocks = layer_config["NUM_BLOCKS"]
num_channels = layer_config["NUM_CHANNELS"]
block = blocks_dict[layer_config["BLOCK"]]
modules = []
for i in range(num_modules):
            # multi_scale_output is only used by the last module
if not multi_scale_output and i == num_modules - 1:
reset_multi_scale_output = False
else:
reset_multi_scale_output = True
modules.append(
HighResolutionModule(
num_branches,
block,
num_blocks,
num_inchannels,
num_channels,
reset_multi_scale_output,
)
)
num_inchannels = modules[-1].get_num_inchannels()
return nn.Sequential(*modules), num_inchannels
def forward(self, x):
x = self.conv1(x)
x = self.bn1(x)
x = self.relu(x)
x = self.conv2(x)
x = self.bn2(x)
x = self.relu(x)
x = self.layer1(x)
x_list = []
for i in range(self.stage2_cfg.NUM_BRANCHES):
if self.transition1[i] is not None:
x_list.append(self.transition1[i](x))
else:
x_list.append(x)
y_list = self.stage2(x_list)
x_list = []
for i in range(self.stage3_cfg.NUM_BRANCHES):
if self.transition2[i] is not None:
x_list.append(self.transition2[i](y_list[-1]))
else:
x_list.append(y_list[i])
y_list = self.stage3(x_list)
x_list = []
for i in range(self.stage4_cfg.NUM_BRANCHES):
if self.transition3[i] is not None:
x_list.append(self.transition3[i](y_list[-1]))
else:
x_list.append(y_list[i])
y_list = self.stage4(x_list)
assert len(self._out_features) == len(y_list)
return dict(zip(self._out_features, y_list)) # final_outputs
@BACKBONE_REGISTRY.register()
def build_pose_hrnet_backbone(cfg, input_shape: ShapeSpec):
model = PoseHigherResolutionNet(cfg)
return model
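# --- Hedged usage sketch (added for illustration; not part of the original file) ---
# Shows one way to instantiate this backbone directly. It assumes that the
# densepose `add_hrnet_config` helper (used alongside this backbone in apply_net.py)
# populates the MODEL.HRNET.* defaults read above (STEM_INPLANES, STAGE2/3/4, ...);
# the input resolution is arbitrary.
def _example_build_pose_hrnet_backbone():
    import torch
    from detectron2.config import get_cfg
    from densepose import add_hrnet_config  # local import to avoid a circular import
    cfg = get_cfg()
    add_hrnet_config(cfg)  # assumption: fills in a default HRNet stage configuration
    backbone = build_pose_hrnet_backbone(cfg, ShapeSpec(channels=3))
    # The backbone returns a dict keyed "p1".."pN", one entry per stage-4 branch.
    features = backbone(torch.randn(1, 3, 256, 256))
    return {name: f.shape for name, f in features.items()}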

View File

@ -0,0 +1,83 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
from typing import List, Tuple
import torch
from detectron2.structures import Instances
from ..data.structures import DensePoseOutput
def densepose_inference(
densepose_outputs: Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor],
densepose_confidences: Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor],
detections: List[Instances],
):
"""
Infer dense pose estimate based on outputs from the DensePose head
and detections. The estimate for each detection instance is stored in its
"pred_densepose" attribute.
Args:
densepose_outputs (tuple(`torch.Tensor`)): iterable containing 4 elements:
- s (:obj: `torch.Tensor`): coarse segmentation tensor of size (N, A, H, W),
- i (:obj: `torch.Tensor`): fine segmentation tensor of size (N, C, H, W),
- u (:obj: `torch.Tensor`): U coordinates for each class of size (N, C, H, W),
- v (:obj: `torch.Tensor`): V coordinates for each class of size (N, C, H, W),
where N is the total number of detections in a batch,
            A is the number of coarse segmentation labels
(e.g. 15 for coarse body parts + background),
C is the number of fine segmentation labels
(e.g. 25 for fine body parts + background),
W is the resolution along the X axis
H is the resolution along the Y axis
        densepose_confidences (tuple(`torch.Tensor`)): iterable containing 6 elements:
- sigma_1 (:obj: `torch.Tensor`): global confidences for UV coordinates
of size (N, C, H, W)
- sigma_2 (:obj: `torch.Tensor`): individual confidences for UV coordinates
of size (N, C, H, W)
- kappa_u (:obj: `torch.Tensor`): first component of confidence direction
vector of size (N, C, H, W)
- kappa_v (:obj: `torch.Tensor`): second component of confidence direction
vector of size (N, C, H, W)
- fine_segm_confidence (:obj: `torch.Tensor`): confidence for fine
segmentation of size (N, 1, H, W)
- coarse_segm_confidence (:obj: `torch.Tensor`): confidence for coarse
segmentation of size (N, 1, H, W)
detections (list[Instances]): A list of N Instances, where N is the number of images
in the batch. Instances are modified by this method: "pred_densepose" attribute
is added to each instance, the attribute contains the corresponding
DensePoseOutput object.
"""
# DensePose outputs: segmentation, body part indices, U, V
s, index_uv, u, v = densepose_outputs
(
sigma_1,
sigma_2,
kappa_u,
kappa_v,
fine_segm_confidence,
coarse_segm_confidence,
) = densepose_confidences
k = 0
for detection in detections:
n_i = len(detection)
s_i = s[k : k + n_i]
index_uv_i = index_uv[k : k + n_i]
u_i = u[k : k + n_i]
v_i = v[k : k + n_i]
_local_vars = locals()
confidences = {
name: _local_vars[name][k : k + n_i]
for name in (
"sigma_1",
"sigma_2",
"kappa_u",
"kappa_v",
"fine_segm_confidence",
"coarse_segm_confidence",
)
if _local_vars.get(name) is not None
}
densepose_output_i = DensePoseOutput(s_i, index_uv_i, u_i, v_i, confidences)
detection.pred_densepose = densepose_output_i
k += n_i
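# --- Hedged usage sketch (added for illustration; not part of the original file) ---
# Minimal end-to-end call with zero-valued tensors, showing that every Instances
# object gains a "pred_densepose" attribute holding its slice of the outputs.
# All sizes are arbitrary; the confidence tuple is filled with None values, which
# the function above silently skips.
def _example_densepose_inference():
    n_total, a, c, h, w = 3, 2, 25, 56, 56
    densepose_outputs = (
        torch.zeros(n_total, a, h, w),  # coarse segmentation scores
        torch.zeros(n_total, c, h, w),  # fine segmentation scores
        torch.zeros(n_total, c, h, w),  # U coordinates
        torch.zeros(n_total, c, h, w),  # V coordinates
    )
    densepose_confidences = (None, None, None, None, None, None)
    # Two images: the first with two detections, the second with one.
    detections = []
    for n_i in (2, 1):
        instances = Instances((256, 256))
        instances.scores = torch.ones(n_i)  # any per-instance field defines len()
        detections.append(instances)
    densepose_inference(densepose_outputs, densepose_confidences, detections)
    return [det.pred_densepose for det in detections]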

View File

@ -0,0 +1,3 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
from .densepose_losses import DensePoseLosses

View File

@ -0,0 +1,729 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import math
from dataclasses import dataclass
from typing import Iterable, Optional
import torch
from torch import nn
from torch.nn import functional as F
from detectron2.structures import Instances
from .. import DensePoseConfidenceModelConfig, DensePoseUVConfidenceType
def _linear_interpolation_utilities(v_norm, v0_src, size_src, v0_dst, size_dst, size_z):
"""
Computes utility values for linear interpolation at points v.
The points are given as normalized offsets in the source interval
(v0_src, v0_src + size_src), more precisely:
v = v0_src + v_norm * size_src / 256.0
The computed utilities include lower points v_lo, upper points v_hi,
interpolation weights v_w and flags j_valid indicating whether the
    points fall into the destination interval (v0_dst, v0_dst + size_dst).
Args:
v_norm (:obj: `torch.Tensor`): tensor of size N containing
normalized point offsets
v0_src (:obj: `torch.Tensor`): tensor of size N containing
left bounds of source intervals for normalized points
size_src (:obj: `torch.Tensor`): tensor of size N containing
source interval sizes for normalized points
v0_dst (:obj: `torch.Tensor`): tensor of size N containing
left bounds of destination intervals
size_dst (:obj: `torch.Tensor`): tensor of size N containing
destination interval sizes
size_z (int): interval size for data to be interpolated
Returns:
v_lo (:obj: `torch.Tensor`): int tensor of size N containing
indices of lower values used for interpolation, all values are
integers from [0, size_z - 1]
v_hi (:obj: `torch.Tensor`): int tensor of size N containing
indices of upper values used for interpolation, all values are
integers from [0, size_z - 1]
v_w (:obj: `torch.Tensor`): float tensor of size N containing
interpolation weights
j_valid (:obj: `torch.Tensor`): uint8 tensor of size N containing
            0 for points outside the destination interval
            (v0_dst, v0_dst + size_dst) and 1 otherwise
"""
v = v0_src + v_norm * size_src / 256.0
j_valid = (v - v0_dst >= 0) * (v - v0_dst < size_dst)
v_grid = (v - v0_dst) * size_z / size_dst
v_lo = v_grid.floor().long().clamp(min=0, max=size_z - 1)
v_hi = (v_lo + 1).clamp(max=size_z - 1)
v_grid = torch.min(v_hi.float(), v_grid)
v_w = v_grid - v_lo.float()
return v_lo, v_hi, v_w, j_valid
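# --- Hedged usage sketch (added for illustration; not part of the original file) ---
# Toy example: three annotated points, given as normalized offsets inside a
# 100 px wide ground-truth interval starting at 0, are mapped into a 56-bin grid
# attached to an 80 px wide estimated interval starting at 10. All numbers are
# made up.
def _example_linear_interpolation_utilities():
    v_norm = torch.tensor([0.0, 128.0, 255.0])  # normalized offsets in [0, 256)
    v0_src = torch.zeros(3)                     # GT interval start
    size_src = torch.full((3,), 100.0)          # GT interval size
    v0_dst = torch.full((3,), 10.0)             # estimated interval start
    size_dst = torch.full((3,), 80.0)           # estimated interval size
    v_lo, v_hi, v_w, j_valid = _linear_interpolation_utilities(
        v_norm, v0_src, size_src, v0_dst, size_dst, size_z=56
    )
    # The first point (absolute coordinate 0) lies before the estimated interval,
    # so j_valid marks it as invalid; the others receive clamped bin indices and
    # interpolation weights.
    return v_lo, v_hi, v_w, j_valid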
class SingleTensorsHelper:
def __init__(self, proposals_with_gt):
with torch.no_grad():
(
index_uv_img,
i_with_dp,
bbox_xywh_est,
bbox_xywh_gt,
index_gt_all,
x_norm,
y_norm,
u_gt_all,
v_gt_all,
s_gt,
index_bbox,
) = _extract_single_tensors_from_matches(proposals_with_gt)
for k, v in locals().items():
if k not in ["self", "proposals_with_gt"]:
setattr(self, k, v)
class BilinearInterpolationHelper:
"""
Args:
tensors_helper (SingleTensorsHelper)
j_valid (:obj: `torch.Tensor`): uint8 tensor of size M containing
0 for points to be discarded and 1 for points to be selected
y_lo (:obj: `torch.Tensor`): int tensor of indices of upper values
in z_est for each point
y_hi (:obj: `torch.Tensor`): int tensor of indices of lower values
in z_est for each point
x_lo (:obj: `torch.Tensor`): int tensor of indices of left values
in z_est for each point
x_hi (:obj: `torch.Tensor`): int tensor of indices of right values
in z_est for each point
w_ylo_xlo (:obj: `torch.Tensor`): float tensor of size M;
contains upper-left value weight for each point
w_ylo_xhi (:obj: `torch.Tensor`): float tensor of size M;
contains upper-right value weight for each point
w_yhi_xlo (:obj: `torch.Tensor`): float tensor of size M;
contains lower-left value weight for each point
w_yhi_xhi (:obj: `torch.Tensor`): float tensor of size M;
contains lower-right value weight for each point
"""
def __init__(
self,
tensors_helper,
j_valid,
y_lo,
y_hi,
x_lo,
x_hi,
w_ylo_xlo,
w_ylo_xhi,
w_yhi_xlo,
w_yhi_xhi,
):
for k, v in locals().items():
if k != "self":
setattr(self, k, v)
@staticmethod
def from_matches(tensors_helper, densepose_outputs_size):
zh, zw = densepose_outputs_size[2], densepose_outputs_size[3]
x0_gt, y0_gt, w_gt, h_gt = tensors_helper.bbox_xywh_gt[tensors_helper.index_bbox].unbind(1)
x0_est, y0_est, w_est, h_est = tensors_helper.bbox_xywh_est[
tensors_helper.index_bbox
].unbind(dim=1)
x_lo, x_hi, x_w, jx_valid = _linear_interpolation_utilities(
tensors_helper.x_norm, x0_gt, w_gt, x0_est, w_est, zw
)
y_lo, y_hi, y_w, jy_valid = _linear_interpolation_utilities(
tensors_helper.y_norm, y0_gt, h_gt, y0_est, h_est, zh
)
j_valid = jx_valid * jy_valid
w_ylo_xlo = (1.0 - x_w) * (1.0 - y_w)
w_ylo_xhi = x_w * (1.0 - y_w)
w_yhi_xlo = (1.0 - x_w) * y_w
w_yhi_xhi = x_w * y_w
return BilinearInterpolationHelper(
tensors_helper,
j_valid,
y_lo,
y_hi,
x_lo,
x_hi,
w_ylo_xlo,
w_ylo_xhi,
w_yhi_xlo,
w_yhi_xhi,
)
def extract_at_points(
self,
z_est,
slice_index_uv=None,
w_ylo_xlo=None,
w_ylo_xhi=None,
w_yhi_xlo=None,
w_yhi_xhi=None,
):
"""
Extract ground truth values z_gt for valid point indices and estimated
values z_est using bilinear interpolation over top-left (y_lo, x_lo),
top-right (y_lo, x_hi), bottom-left (y_hi, x_lo) and bottom-right
(y_hi, x_hi) values in z_est with corresponding weights:
w_ylo_xlo, w_ylo_xhi, w_yhi_xlo and w_yhi_xhi.
Use slice_index_uv to slice dim=1 in z_est
"""
index_gt_all = self.tensors_helper.index_gt_all
slice_index_uv = index_gt_all if slice_index_uv is None else slice_index_uv
w_ylo_xlo = self.w_ylo_xlo if w_ylo_xlo is None else w_ylo_xlo
w_ylo_xhi = self.w_ylo_xhi if w_ylo_xhi is None else w_ylo_xhi
w_yhi_xlo = self.w_yhi_xlo if w_yhi_xlo is None else w_yhi_xlo
w_yhi_xhi = self.w_yhi_xhi if w_yhi_xhi is None else w_yhi_xhi
index_bbox = self.tensors_helper.index_bbox
z_est_sampled = (
z_est[index_bbox, slice_index_uv, self.y_lo, self.x_lo] * w_ylo_xlo
+ z_est[index_bbox, slice_index_uv, self.y_lo, self.x_hi] * w_ylo_xhi
+ z_est[index_bbox, slice_index_uv, self.y_hi, self.x_lo] * w_yhi_xlo
+ z_est[index_bbox, slice_index_uv, self.y_hi, self.x_hi] * w_yhi_xhi
)
return z_est_sampled
def _resample_data(
z, bbox_xywh_src, bbox_xywh_dst, wout, hout, mode="nearest", padding_mode="zeros"
):
"""
Args:
z (:obj: `torch.Tensor`): tensor of size (N,C,H,W) with data to be
resampled
bbox_xywh_src (:obj: `torch.Tensor`): tensor of size (N,4) containing
source bounding boxes in format XYWH
bbox_xywh_dst (:obj: `torch.Tensor`): tensor of size (N,4) containing
destination bounding boxes in format XYWH
Return:
        zresampled (:obj: `torch.Tensor`): tensor of size (N, C, Hout, Wout)
            with values of z resampled into the destination bounding boxes
"""
n = bbox_xywh_src.size(0)
assert n == bbox_xywh_dst.size(0), (
"The number of "
"source ROIs for resampling ({}) should be equal to the number "
"of destination ROIs ({})".format(bbox_xywh_src.size(0), bbox_xywh_dst.size(0))
)
x0src, y0src, wsrc, hsrc = bbox_xywh_src.unbind(dim=1)
x0dst, y0dst, wdst, hdst = bbox_xywh_dst.unbind(dim=1)
x0dst_norm = 2 * (x0dst - x0src) / wsrc - 1
y0dst_norm = 2 * (y0dst - y0src) / hsrc - 1
x1dst_norm = 2 * (x0dst + wdst - x0src) / wsrc - 1
y1dst_norm = 2 * (y0dst + hdst - y0src) / hsrc - 1
grid_w = torch.arange(wout, device=z.device, dtype=torch.float) / wout
grid_h = torch.arange(hout, device=z.device, dtype=torch.float) / hout
grid_w_expanded = grid_w[None, None, :].expand(n, hout, wout)
grid_h_expanded = grid_h[None, :, None].expand(n, hout, wout)
dx_expanded = (x1dst_norm - x0dst_norm)[:, None, None].expand(n, hout, wout)
dy_expanded = (y1dst_norm - y0dst_norm)[:, None, None].expand(n, hout, wout)
x0_expanded = x0dst_norm[:, None, None].expand(n, hout, wout)
y0_expanded = y0dst_norm[:, None, None].expand(n, hout, wout)
grid_x = grid_w_expanded * dx_expanded + x0_expanded
grid_y = grid_h_expanded * dy_expanded + y0_expanded
grid = torch.stack((grid_x, grid_y), dim=3)
# resample Z from (N, C, H, W) into (N, C, Hout, Wout)
zresampled = F.grid_sample(z, grid, mode=mode, padding_mode=padding_mode, align_corners=True)
return zresampled
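# --- Hedged usage sketch (added for illustration; not part of the original file) ---
# Resamples a 1x1x4x4 ramp from its full source box into a destination box that
# covers only its right half; with "nearest" mode the output is sampled from the
# right-hand columns of the input. Box values are arbitrary.
def _example_resample_data():
    z = torch.arange(16, dtype=torch.float32).view(1, 1, 4, 4)
    bbox_xywh_src = torch.tensor([[0.0, 0.0, 4.0, 4.0]])  # the whole input
    bbox_xywh_dst = torch.tensor([[2.0, 0.0, 2.0, 4.0]])  # its right half
    zresampled = _resample_data(
        z, bbox_xywh_src, bbox_xywh_dst, wout=4, hout=4, mode="nearest"
    )
    return zresampled  # shape (1, 1, 4, 4)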
def _extract_single_tensors_from_matches_one_image(
proposals_targets, bbox_with_dp_offset, bbox_global_offset
):
i_gt_all = []
x_norm_all = []
y_norm_all = []
u_gt_all = []
v_gt_all = []
s_gt_all = []
bbox_xywh_gt_all = []
bbox_xywh_est_all = []
    # i_bbox_all == k should be true for all data that corresponds
# to bbox_xywh_gt[k] and bbox_xywh_est[k]
# index k here is global wrt images
i_bbox_all = []
# at offset k (k is global) contains index of bounding box data
# within densepose output tensor
i_with_dp = []
boxes_xywh_est = proposals_targets.proposal_boxes.clone()
boxes_xywh_gt = proposals_targets.gt_boxes.clone()
n_i = len(boxes_xywh_est)
assert n_i == len(boxes_xywh_gt)
if n_i:
boxes_xywh_est.tensor[:, 2] -= boxes_xywh_est.tensor[:, 0]
boxes_xywh_est.tensor[:, 3] -= boxes_xywh_est.tensor[:, 1]
boxes_xywh_gt.tensor[:, 2] -= boxes_xywh_gt.tensor[:, 0]
boxes_xywh_gt.tensor[:, 3] -= boxes_xywh_gt.tensor[:, 1]
if hasattr(proposals_targets, "gt_densepose"):
densepose_gt = proposals_targets.gt_densepose
for k, box_xywh_est, box_xywh_gt, dp_gt in zip(
range(n_i), boxes_xywh_est.tensor, boxes_xywh_gt.tensor, densepose_gt
):
if (dp_gt is not None) and (len(dp_gt.x) > 0):
i_gt_all.append(dp_gt.i)
x_norm_all.append(dp_gt.x)
y_norm_all.append(dp_gt.y)
u_gt_all.append(dp_gt.u)
v_gt_all.append(dp_gt.v)
s_gt_all.append(dp_gt.segm.unsqueeze(0))
bbox_xywh_gt_all.append(box_xywh_gt.view(-1, 4))
bbox_xywh_est_all.append(box_xywh_est.view(-1, 4))
i_bbox_k = torch.full_like(dp_gt.i, bbox_with_dp_offset + len(i_with_dp))
i_bbox_all.append(i_bbox_k)
i_with_dp.append(bbox_global_offset + k)
return (
i_gt_all,
x_norm_all,
y_norm_all,
u_gt_all,
v_gt_all,
s_gt_all,
bbox_xywh_gt_all,
bbox_xywh_est_all,
i_bbox_all,
i_with_dp,
)
def _extract_single_tensors_from_matches(proposals_with_targets):
i_img = []
i_gt_all = []
x_norm_all = []
y_norm_all = []
u_gt_all = []
v_gt_all = []
s_gt_all = []
bbox_xywh_gt_all = []
bbox_xywh_est_all = []
i_bbox_all = []
i_with_dp_all = []
n = 0
for i, proposals_targets_per_image in enumerate(proposals_with_targets):
n_i = proposals_targets_per_image.proposal_boxes.tensor.size(0)
if not n_i:
continue
(
i_gt_img,
x_norm_img,
y_norm_img,
u_gt_img,
v_gt_img,
s_gt_img,
bbox_xywh_gt_img,
bbox_xywh_est_img,
i_bbox_img,
i_with_dp_img,
) = _extract_single_tensors_from_matches_one_image( # noqa
proposals_targets_per_image, len(i_with_dp_all), n
)
i_gt_all.extend(i_gt_img)
x_norm_all.extend(x_norm_img)
y_norm_all.extend(y_norm_img)
u_gt_all.extend(u_gt_img)
v_gt_all.extend(v_gt_img)
s_gt_all.extend(s_gt_img)
bbox_xywh_gt_all.extend(bbox_xywh_gt_img)
bbox_xywh_est_all.extend(bbox_xywh_est_img)
i_bbox_all.extend(i_bbox_img)
i_with_dp_all.extend(i_with_dp_img)
i_img.extend([i] * len(i_with_dp_img))
n += n_i
# concatenate all data into a single tensor
if (n > 0) and (len(i_with_dp_all) > 0):
i_gt = torch.cat(i_gt_all, 0).long()
x_norm = torch.cat(x_norm_all, 0)
y_norm = torch.cat(y_norm_all, 0)
u_gt = torch.cat(u_gt_all, 0)
v_gt = torch.cat(v_gt_all, 0)
s_gt = torch.cat(s_gt_all, 0)
bbox_xywh_gt = torch.cat(bbox_xywh_gt_all, 0)
bbox_xywh_est = torch.cat(bbox_xywh_est_all, 0)
i_bbox = torch.cat(i_bbox_all, 0).long()
else:
i_gt = None
x_norm = None
y_norm = None
u_gt = None
v_gt = None
s_gt = None
bbox_xywh_gt = None
bbox_xywh_est = None
i_bbox = None
return (
i_img,
i_with_dp_all,
bbox_xywh_est,
bbox_xywh_gt,
i_gt,
x_norm,
y_norm,
u_gt,
v_gt,
s_gt,
i_bbox,
)
@dataclass
class DataForMaskLoss:
"""
Contains mask GT and estimated data for proposals from multiple images:
"""
# tensor of size (K, H, W) containing GT labels
masks_gt: Optional[torch.Tensor] = None
# tensor of size (K, C, H, W) containing estimated scores
masks_est: Optional[torch.Tensor] = None
def _extract_data_for_mask_loss_from_matches(
proposals_targets: Iterable[Instances], estimated_segm: torch.Tensor
) -> DataForMaskLoss:
"""
Extract data for mask loss from instances that contain matched GT and
estimated bounding boxes.
Args:
proposals_targets: Iterable[Instances]
matched GT and estimated results, each item in the iterable
corresponds to data in 1 image
        estimated_segm: torch.Tensor of size (N, C, H, W); its spatial size
            defines the size to which GT masks are resized
Return:
masks_est: tensor(K, C, H, W) of float - class scores
masks_gt: tensor(K, H, W) of int64 - labels
"""
data = DataForMaskLoss()
masks_gt = []
offset = 0
assert estimated_segm.shape[2] == estimated_segm.shape[3], (
f"Expected estimated segmentation to have a square shape, "
f"but the actual shape is {estimated_segm.shape[2:]}"
)
mask_size = estimated_segm.shape[2]
num_proposals = sum(inst.proposal_boxes.tensor.size(0) for inst in proposals_targets)
num_estimated = estimated_segm.shape[0]
assert (
num_proposals == num_estimated
), "The number of proposals {} must be equal to the number of estimates {}".format(
num_proposals, num_estimated
)
for proposals_targets_per_image in proposals_targets:
n_i = proposals_targets_per_image.proposal_boxes.tensor.size(0)
if not n_i:
continue
gt_masks_per_image = proposals_targets_per_image.gt_masks.crop_and_resize(
proposals_targets_per_image.proposal_boxes.tensor, mask_size
).to(device=estimated_segm.device)
masks_gt.append(gt_masks_per_image)
offset += n_i
if masks_gt:
data.masks_est = estimated_segm
data.masks_gt = torch.cat(masks_gt, dim=0)
return data
class IIDIsotropicGaussianUVLoss(nn.Module):
"""
Loss for the case of iid residuals with isotropic covariance:
$Sigma_i = sigma_i^2 I$
The loss (negative log likelihood) is then:
$1/2 sum_{i=1}^n (log(2 pi) + 2 log sigma_i^2 + ||delta_i||^2 / sigma_i^2)$,
where $delta_i=(u - u', v - v')$ is a 2D vector containing UV coordinates
difference between estimated and ground truth UV values
For details, see:
N. Neverova, D. Novotny, A. Vedaldi "Correlated Uncertainty for Learning
Dense Correspondences from Noisy Labels", p. 918--926, in Proc. NIPS 2019
"""
def __init__(self, sigma_lower_bound: float):
super(IIDIsotropicGaussianUVLoss, self).__init__()
self.sigma_lower_bound = sigma_lower_bound
self.log2pi = math.log(2 * math.pi)
def forward(
self,
u: torch.Tensor,
v: torch.Tensor,
sigma_u: torch.Tensor,
target_u: torch.Tensor,
target_v: torch.Tensor,
):
# compute $\sigma_i^2$
# use sigma_lower_bound to avoid degenerate solution for variance
# (sigma -> 0)
sigma2 = F.softplus(sigma_u) + self.sigma_lower_bound
# compute \|delta_i\|^2
delta_t_delta = (u - target_u) ** 2 + (v - target_v) ** 2
# the total loss from the formula above:
loss = 0.5 * (self.log2pi + 2 * torch.log(sigma2) + delta_t_delta / sigma2)
return loss.sum()
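# --- Hedged worked example (added for illustration; not part of the original file) ---
# Cross-checks the module against the closed-form negative log-likelihood
# 0.5 * (log(2 pi) + 2 log(sigma^2) + ||delta||^2 / sigma^2) for a single point;
# the sigma_lower_bound value is arbitrary.
def _example_iid_isotropic_loss():
    loss_fn = IIDIsotropicGaussianUVLoss(sigma_lower_bound=0.01)
    u = torch.tensor([0.5])
    v = torch.tensor([0.5])
    target_u = torch.tensor([0.3])
    target_v = torch.tensor([0.1])
    sigma_u = torch.tensor([0.0])  # raw (pre-softplus) sigma parameter
    loss = loss_fn(u, v, sigma_u, target_u, target_v)
    sigma2 = math.log(2.0) + 0.01  # softplus(0) == log(2)
    delta2 = (0.5 - 0.3) ** 2 + (0.5 - 0.1) ** 2
    expected = 0.5 * (math.log(2 * math.pi) + 2 * math.log(sigma2) + delta2 / sigma2)
    assert abs(loss.item() - expected) < 1e-4
    return loss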
class IndepAnisotropicGaussianUVLoss(nn.Module):
"""
Loss for the case of independent residuals with anisotropic covariances:
$Sigma_i = sigma_i^2 I + r_i r_i^T$
The loss (negative log likelihood) is then:
$1/2 sum_{i=1}^n (log(2 pi)
      + log(sigma_i^2 (sigma_i^2 + ||r_i||^2))
+ ||delta_i||^2 / sigma_i^2
- <delta_i, r_i>^2 / (sigma_i^2 * (sigma_i^2 + ||r_i||^2)))$,
where $delta_i=(u - u', v - v')$ is a 2D vector containing UV coordinates
difference between estimated and ground truth UV values
For details, see:
N. Neverova, D. Novotny, A. Vedaldi "Correlated Uncertainty for Learning
Dense Correspondences from Noisy Labels", p. 918--926, in Proc. NIPS 2019
"""
def __init__(self, sigma_lower_bound: float):
super(IndepAnisotropicGaussianUVLoss, self).__init__()
self.sigma_lower_bound = sigma_lower_bound
self.log2pi = math.log(2 * math.pi)
def forward(
self,
u: torch.Tensor,
v: torch.Tensor,
sigma_u: torch.Tensor,
kappa_u_est: torch.Tensor,
kappa_v_est: torch.Tensor,
target_u: torch.Tensor,
target_v: torch.Tensor,
):
# compute $\sigma_i^2$
sigma2 = F.softplus(sigma_u) + self.sigma_lower_bound
# compute \|r_i\|^2
r_sqnorm2 = kappa_u_est ** 2 + kappa_v_est ** 2
delta_u = u - target_u
delta_v = v - target_v
# compute \|delta_i\|^2
delta_sqnorm = delta_u ** 2 + delta_v ** 2
delta_u_r_u = delta_u * kappa_u_est
delta_v_r_v = delta_v * kappa_v_est
# compute the scalar product <delta_i, r_i>
delta_r = delta_u_r_u + delta_v_r_v
# compute squared scalar product <delta_i, r_i>^2
delta_r_sqnorm = delta_r ** 2
denom2 = sigma2 * (sigma2 + r_sqnorm2)
loss = 0.5 * (
self.log2pi + torch.log(denom2) + delta_sqnorm / sigma2 - delta_r_sqnorm / denom2
)
return loss.sum()
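# --- Hedged worked example (added for illustration; not part of the original file) ---
# Sanity check of the formula above: with kappa_u = kappa_v = 0 the covariance
# degenerates to sigma^2 I, so the loss must coincide with
# IIDIsotropicGaussianUVLoss on the same inputs. Values are arbitrary.
def _example_indep_anisotropic_loss_reduces_to_isotropic():
    u = torch.tensor([0.5])
    v = torch.tensor([0.5])
    target_u = torch.tensor([0.3])
    target_v = torch.tensor([0.1])
    sigma_u = torch.tensor([0.0])
    zero_kappa = torch.tensor([0.0])
    aniso_loss = IndepAnisotropicGaussianUVLoss(sigma_lower_bound=0.01)
    iso_loss = IIDIsotropicGaussianUVLoss(sigma_lower_bound=0.01)
    loss_aniso = aniso_loss(u, v, sigma_u, zero_kappa, zero_kappa, target_u, target_v)
    loss_iso = iso_loss(u, v, sigma_u, target_u, target_v)
    assert abs(loss_aniso.item() - loss_iso.item()) < 1e-5
    return loss_aniso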
class DensePoseLosses(object):
def __init__(self, cfg):
# fmt: off
self.heatmap_size = cfg.MODEL.ROI_DENSEPOSE_HEAD.HEATMAP_SIZE
self.w_points = cfg.MODEL.ROI_DENSEPOSE_HEAD.POINT_REGRESSION_WEIGHTS
self.w_part = cfg.MODEL.ROI_DENSEPOSE_HEAD.PART_WEIGHTS
self.w_segm = cfg.MODEL.ROI_DENSEPOSE_HEAD.INDEX_WEIGHTS
self.n_segm_chan = cfg.MODEL.ROI_DENSEPOSE_HEAD.NUM_COARSE_SEGM_CHANNELS
# fmt: on
self.segm_trained_by_masks = cfg.MODEL.ROI_DENSEPOSE_HEAD.COARSE_SEGM_TRAINED_BY_MASKS
self.confidence_model_cfg = DensePoseConfidenceModelConfig.from_cfg(cfg)
if self.confidence_model_cfg.uv_confidence.type == DensePoseUVConfidenceType.IID_ISO:
self.uv_loss_with_confidences = IIDIsotropicGaussianUVLoss(
self.confidence_model_cfg.uv_confidence.epsilon
)
elif self.confidence_model_cfg.uv_confidence.type == DensePoseUVConfidenceType.INDEP_ANISO:
self.uv_loss_with_confidences = IndepAnisotropicGaussianUVLoss(
self.confidence_model_cfg.uv_confidence.epsilon
)
def __call__(self, proposals_with_gt, densepose_outputs, densepose_confidences):
if not self.segm_trained_by_masks:
return self.produce_densepose_losses(
proposals_with_gt, densepose_outputs, densepose_confidences
)
else:
losses = {}
losses_densepose = self.produce_densepose_losses(
proposals_with_gt, densepose_outputs, densepose_confidences
)
losses.update(losses_densepose)
losses_mask = self.produce_mask_losses(
proposals_with_gt, densepose_outputs, densepose_confidences
)
losses.update(losses_mask)
return losses
def produce_fake_mask_losses(self, densepose_outputs):
losses = {}
segm_scores, _, _, _ = densepose_outputs
losses["loss_densepose_S"] = segm_scores.sum() * 0
return losses
def produce_mask_losses(self, proposals_with_gt, densepose_outputs, densepose_confidences):
if not len(proposals_with_gt):
return self.produce_fake_mask_losses(densepose_outputs)
losses = {}
# densepose outputs are computed for all images and all bounding boxes;
# i.e. if a batch has 4 images with (3, 1, 2, 1) proposals respectively,
# the outputs will have size(0) == 3+1+2+1 == 7
segm_scores, _, _, _ = densepose_outputs
with torch.no_grad():
mask_loss_data = _extract_data_for_mask_loss_from_matches(
proposals_with_gt, segm_scores
)
if (mask_loss_data.masks_gt is None) or (mask_loss_data.masks_est is None):
return self.produce_fake_mask_losses(densepose_outputs)
losses["loss_densepose_S"] = (
F.cross_entropy(mask_loss_data.masks_est, mask_loss_data.masks_gt.long()) * self.w_segm
)
return losses
def produce_fake_densepose_losses(self, densepose_outputs, densepose_confidences):
# we need to keep the same computation graph on all the GPUs to
# perform reduction properly. Hence even if we have no data on one
# of the GPUs, we still need to generate the computation graph.
# Add fake (zero) losses in the form Tensor.sum() * 0
s, index_uv, u, v = densepose_outputs
conf_type = self.confidence_model_cfg.uv_confidence.type
(
sigma_1,
sigma_2,
kappa_u,
kappa_v,
fine_segm_confidence,
coarse_segm_confidence,
) = densepose_confidences
losses = {}
losses["loss_densepose_I"] = index_uv.sum() * 0
if not self.segm_trained_by_masks:
losses["loss_densepose_S"] = s.sum() * 0
if self.confidence_model_cfg.uv_confidence.enabled:
losses["loss_densepose_UV"] = (u.sum() + v.sum()) * 0
if conf_type == DensePoseUVConfidenceType.IID_ISO:
losses["loss_densepose_UV"] += sigma_2.sum() * 0
elif conf_type == DensePoseUVConfidenceType.INDEP_ANISO:
losses["loss_densepose_UV"] += (sigma_2.sum() + kappa_u.sum() + kappa_v.sum()) * 0
else:
losses["loss_densepose_U"] = u.sum() * 0
losses["loss_densepose_V"] = v.sum() * 0
return losses
def produce_densepose_losses(self, proposals_with_gt, densepose_outputs, densepose_confidences):
losses = {}
# densepose outputs are computed for all images and all bounding boxes;
# i.e. if a batch has 4 images with (3, 1, 2, 1) proposals respectively,
# the outputs will have size(0) == 3+1+2+1 == 7
s, index_uv, u, v = densepose_outputs
assert u.size(2) == v.size(2)
assert u.size(3) == v.size(3)
assert u.size(2) == index_uv.size(2)
assert u.size(3) == index_uv.size(3)
densepose_outputs_size = u.size()
if not len(proposals_with_gt):
return self.produce_fake_densepose_losses(densepose_outputs, densepose_confidences)
(
sigma_1,
sigma_2,
kappa_u,
kappa_v,
fine_segm_confidence,
coarse_segm_confidence,
) = densepose_confidences
conf_type = self.confidence_model_cfg.uv_confidence.type
tensors_helper = SingleTensorsHelper(proposals_with_gt)
n_batch = len(tensors_helper.i_with_dp)
# NOTE: we need to keep the same computation graph on all the GPUs to
# perform reduction properly. Hence even if we have no data on one
# of the GPUs, we still need to generate the computation graph.
# Add fake (zero) loss in the form Tensor.sum() * 0
if not n_batch:
return self.produce_fake_densepose_losses(densepose_outputs, densepose_confidences)
interpolator = BilinearInterpolationHelper.from_matches(
tensors_helper, densepose_outputs_size
)
j_valid_fg = interpolator.j_valid * (tensors_helper.index_gt_all > 0)
u_gt = tensors_helper.u_gt_all[j_valid_fg]
u_est_all = interpolator.extract_at_points(u[tensors_helper.i_with_dp])
u_est = u_est_all[j_valid_fg]
v_gt = tensors_helper.v_gt_all[j_valid_fg]
v_est_all = interpolator.extract_at_points(v[tensors_helper.i_with_dp])
v_est = v_est_all[j_valid_fg]
index_uv_gt = tensors_helper.index_gt_all[interpolator.j_valid]
index_uv_est_all = interpolator.extract_at_points(
index_uv[tensors_helper.i_with_dp],
slice_index_uv=slice(None),
w_ylo_xlo=interpolator.w_ylo_xlo[:, None],
w_ylo_xhi=interpolator.w_ylo_xhi[:, None],
w_yhi_xlo=interpolator.w_yhi_xlo[:, None],
w_yhi_xhi=interpolator.w_yhi_xhi[:, None],
)
index_uv_est = index_uv_est_all[interpolator.j_valid, :]
if self.confidence_model_cfg.uv_confidence.enabled:
sigma_2_est_all = interpolator.extract_at_points(sigma_2[tensors_helper.i_with_dp])
sigma_2_est = sigma_2_est_all[j_valid_fg]
if conf_type in [DensePoseUVConfidenceType.INDEP_ANISO]:
kappa_u_est_all = interpolator.extract_at_points(kappa_u[tensors_helper.i_with_dp])
kappa_u_est = kappa_u_est_all[j_valid_fg]
kappa_v_est_all = interpolator.extract_at_points(kappa_v[tensors_helper.i_with_dp])
kappa_v_est = kappa_v_est_all[j_valid_fg]
        # Resample GT data to the estimated data size; there is no need to resample
        # s_est itself:
if not self.segm_trained_by_masks:
s_est = s[tensors_helper.i_with_dp]
with torch.no_grad():
s_gt = _resample_data(
tensors_helper.s_gt.unsqueeze(1),
tensors_helper.bbox_xywh_gt,
tensors_helper.bbox_xywh_est,
self.heatmap_size,
self.heatmap_size,
mode="nearest",
padding_mode="zeros",
).squeeze(1)
# add point-based losses:
if self.confidence_model_cfg.uv_confidence.enabled:
if conf_type == DensePoseUVConfidenceType.IID_ISO:
uv_loss = (
self.uv_loss_with_confidences(u_est, v_est, sigma_2_est, u_gt, v_gt)
* self.w_points
)
losses["loss_densepose_UV"] = uv_loss
elif conf_type == DensePoseUVConfidenceType.INDEP_ANISO:
uv_loss = (
self.uv_loss_with_confidences(
u_est, v_est, sigma_2_est, kappa_u_est, kappa_v_est, u_gt, v_gt
)
* self.w_points
)
losses["loss_densepose_UV"] = uv_loss
else:
raise ValueError(f"Unknown confidence model type: {conf_type}")
else:
u_loss = F.smooth_l1_loss(u_est, u_gt, reduction="sum") * self.w_points
losses["loss_densepose_U"] = u_loss
v_loss = F.smooth_l1_loss(v_est, v_gt, reduction="sum") * self.w_points
losses["loss_densepose_V"] = v_loss
index_uv_loss = F.cross_entropy(index_uv_est, index_uv_gt.long()) * self.w_part
losses["loss_densepose_I"] = index_uv_loss
if not self.segm_trained_by_masks:
if self.n_segm_chan == 2:
s_gt = s_gt > 0
s_loss = F.cross_entropy(s_est, s_gt.long()) * self.w_segm
losses["loss_densepose_S"] = s_loss
return losses

View File

@ -0,0 +1,5 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
from .chart import DensePoseChartPredictor
from .chart_confidence import DensePoseChartConfidencePredictorMixin
from .chart_with_confidence import DensePoseChartWithConfidencePredictor

View File

@ -0,0 +1,102 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import torch
from torch import nn
from detectron2.config import CfgNode
from detectron2.layers import ConvTranspose2d, interpolate
from ..utils import initialize_module_params
class DensePoseChartPredictor(nn.Module):
"""
Predictor (last layers of a DensePose model) that takes DensePose head outputs as an input
and produces 4 tensors which represent DensePose results for predefined body parts
(patches / charts):
- coarse segmentation [N, K, H, W]
- fine segmentation [N, C, H, W]
- U coordinates [N, C, H, W]
- V coordinates [N, C, H, W]
where
- N is the number of instances
- K is the number of coarse segmentation channels (
2 = foreground / background,
15 = one of 14 body parts / background)
- C is the number of fine segmentation channels (
24 fine body parts / background)
- H and W are height and width of predictions
"""
def __init__(self, cfg: CfgNode, input_channels: int):
"""
Initialize predictor using configuration options
Args:
cfg (CfgNode): configuration options
input_channels (int): input tensor size along the channel dimension
"""
super().__init__()
dim_in = input_channels
n_segm_chan = cfg.MODEL.ROI_DENSEPOSE_HEAD.NUM_COARSE_SEGM_CHANNELS
dim_out_patches = cfg.MODEL.ROI_DENSEPOSE_HEAD.NUM_PATCHES + 1
kernel_size = cfg.MODEL.ROI_DENSEPOSE_HEAD.DECONV_KERNEL
self.ann_index_lowres = ConvTranspose2d(
dim_in, n_segm_chan, kernel_size, stride=2, padding=int(kernel_size / 2 - 1)
)
self.index_uv_lowres = ConvTranspose2d(
dim_in, dim_out_patches, kernel_size, stride=2, padding=int(kernel_size / 2 - 1)
)
self.u_lowres = ConvTranspose2d(
dim_in, dim_out_patches, kernel_size, stride=2, padding=int(kernel_size / 2 - 1)
)
self.v_lowres = ConvTranspose2d(
dim_in, dim_out_patches, kernel_size, stride=2, padding=int(kernel_size / 2 - 1)
)
self.scale_factor = cfg.MODEL.ROI_DENSEPOSE_HEAD.UP_SCALE
initialize_module_params(self)
def interp2d(self, tensor_nchw: torch.Tensor):
"""
Bilinear interpolation method to be used for upscaling
Args:
tensor_nchw (tensor): tensor of shape (N, C, H, W)
Return:
tensor of shape (N, C, Hout, Wout), where Hout and Wout are computed
by applying the scale factor to H and W
"""
return interpolate(
tensor_nchw, scale_factor=self.scale_factor, mode="bilinear", align_corners=False
)
def forward(self, head_outputs: torch.Tensor):
"""
Perform forward step on DensePose head outputs
Args:
head_outputs (tensor): DensePose head outputs, tensor of shape [N, D, H, W]
Return:
- a tuple of 4 tensors containing DensePose predictions for charts:
* coarse segmentation estimate, a tensor of shape [N, K, Hout, Wout]
* fine segmentation estimate, a tensor of shape [N, C, Hout, Wout]
* U coordinates, a tensor of shape [N, C, Hout, Wout]
* V coordinates, a tensor of shape [N, C, Hout, Wout]
- a tuple of 4 tensors containing DensePose predictions for charts at reduced resolution:
* coarse segmentation estimate, a tensor of shape [N, K, Hout / 2, Wout / 2]
* fine segmentation estimate, a tensor of shape [N, C, Hout / 2, Wout / 2]
* U coordinates, a tensor of shape [N, C, Hout / 2, Wout / 2]
* V coordinates, a tensor of shape [N, C, Hout / 2, Wout / 2]
"""
coarse_segm_lowres = self.ann_index_lowres(head_outputs)
fine_segm_lowres = self.index_uv_lowres(head_outputs)
u_lowres = self.u_lowres(head_outputs)
v_lowres = self.v_lowres(head_outputs)
coarse_segm = self.interp2d(coarse_segm_lowres)
fine_segm = self.interp2d(fine_segm_lowres)
u = self.interp2d(u_lowres)
v = self.interp2d(v_lowres)
siuv = (coarse_segm, fine_segm, u, v)
siuv_lowres = (coarse_segm_lowres, fine_segm_lowres, u_lowres, v_lowres)
return siuv, siuv_lowres
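# --- Hedged usage sketch (added for illustration; not part of the original file) ---
# Builds the predictor from a hand-rolled CfgNode carrying only the four options
# read in __init__ (the values mirror common DensePose defaults but are assumptions
# here) and runs it on a random head output. The transpose convolution (stride 2)
# followed by 2x bilinear upsampling turns 14x14 inputs into 56x56 outputs.
def _example_chart_predictor():
    from detectron2.config import CfgNode as CN
    cfg = CN()
    cfg.MODEL = CN()
    cfg.MODEL.ROI_DENSEPOSE_HEAD = CN()
    cfg.MODEL.ROI_DENSEPOSE_HEAD.NUM_COARSE_SEGM_CHANNELS = 2
    cfg.MODEL.ROI_DENSEPOSE_HEAD.NUM_PATCHES = 24
    cfg.MODEL.ROI_DENSEPOSE_HEAD.DECONV_KERNEL = 4
    cfg.MODEL.ROI_DENSEPOSE_HEAD.UP_SCALE = 2
    predictor = DensePoseChartPredictor(cfg, input_channels=512)
    head_outputs = torch.randn(2, 512, 14, 14)
    (coarse_segm, fine_segm, u, v), siuv_lowres = predictor(head_outputs)
    return coarse_segm.shape, fine_segm.shape, u.shape, v.shape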

View File

@ -0,0 +1,176 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import torch
from torch.nn import functional as F
from detectron2.config import CfgNode
from detectron2.layers import ConvTranspose2d
from ..confidence import DensePoseConfidenceModelConfig, DensePoseUVConfidenceType
from ..utils import initialize_module_params
class DensePoseChartConfidencePredictorMixin:
"""
Predictor contains the last layers of a DensePose model that take DensePose head
outputs as an input and produce model outputs. Confidence predictor mixin is used
to generate confidences for segmentation and UV tensors estimated by some
base predictor. Several assumptions need to hold for the base predictor:
1) the `forward` method must return SIUV tuple as the first result (
S = coarse segmentation, I = fine segmentation, U and V are intrinsic
chart coordinates)
2) `interp2d` method must be defined to perform bilinear interpolation;
the same method is typically used for SIUV and confidences
Confidence predictor mixin provides confidence estimates, as described in:
N. Neverova et al., Correlated Uncertainty for Learning Dense Correspondences
from Noisy Labels, NeurIPS 2019
A. Sanakoyeu et al., Transferring Dense Pose to Proximal Animal Classes, CVPR 2020
"""
def __init__(self, cfg: CfgNode, input_channels: int):
"""
Initialize confidence predictor using configuration options.
Args:
cfg (CfgNode): configuration options
input_channels (int): number of input channels
"""
# we rely on base predictor to call nn.Module.__init__
super().__init__(cfg, input_channels)
self.confidence_model_cfg = DensePoseConfidenceModelConfig.from_cfg(cfg)
self._initialize_confidence_estimation_layers(cfg, input_channels)
initialize_module_params(self)
def _initialize_confidence_estimation_layers(self, cfg: CfgNode, dim_in: int):
"""
Initialize confidence estimation layers based on configuration options
Args:
cfg (CfgNode): configuration options
dim_in (int): number of input channels
"""
dim_out_patches = cfg.MODEL.ROI_DENSEPOSE_HEAD.NUM_PATCHES + 1
kernel_size = cfg.MODEL.ROI_DENSEPOSE_HEAD.DECONV_KERNEL
if self.confidence_model_cfg.uv_confidence.enabled:
if self.confidence_model_cfg.uv_confidence.type == DensePoseUVConfidenceType.IID_ISO:
self.sigma_2_lowres = ConvTranspose2d(
dim_in, dim_out_patches, kernel_size, stride=2, padding=int(kernel_size / 2 - 1)
)
elif (
self.confidence_model_cfg.uv_confidence.type
== DensePoseUVConfidenceType.INDEP_ANISO
):
self.sigma_2_lowres = ConvTranspose2d(
dim_in, dim_out_patches, kernel_size, stride=2, padding=int(kernel_size / 2 - 1)
)
self.kappa_u_lowres = ConvTranspose2d(
dim_in, dim_out_patches, kernel_size, stride=2, padding=int(kernel_size / 2 - 1)
)
self.kappa_v_lowres = ConvTranspose2d(
dim_in, dim_out_patches, kernel_size, stride=2, padding=int(kernel_size / 2 - 1)
)
else:
                raise ValueError(
                    f"Unknown confidence model type: "
                    f"{self.confidence_model_cfg.uv_confidence.type}"
                )
if self.confidence_model_cfg.segm_confidence.enabled:
self.fine_segm_confidence_lowres = ConvTranspose2d(
dim_in, 1, kernel_size, stride=2, padding=int(kernel_size / 2 - 1)
)
self.coarse_segm_confidence_lowres = ConvTranspose2d(
dim_in, 1, kernel_size, stride=2, padding=int(kernel_size / 2 - 1)
)
def forward(self, head_outputs: torch.Tensor):
"""
Perform forward operation on head outputs used as inputs for the predictor.
Calls forward method from the base predictor and uses its outputs to compute
confidences.
Args:
head_outputs (Tensor): head outputs used as predictor inputs
Return:
A tuple containing the following entries:
- SIUV tuple with possibly modified segmentation tensors
- various other outputs from the base predictor
- 6 tensors with estimated confidence model parameters at full resolution
(sigma_1, sigma_2, kappa_u, kappa_v, fine_segm_confidence, coarse_segm_confidence)
- 6 tensors with estimated confidence model parameters at half resolution
(sigma_1, sigma_2, kappa_u, kappa_v, fine_segm_confidence, coarse_segm_confidence)
"""
# assuming base class returns SIUV estimates in its first result
base_predictor_outputs = super().forward(head_outputs)
siuv = (
base_predictor_outputs[0]
if isinstance(base_predictor_outputs, tuple)
else base_predictor_outputs
)
coarse_segm, fine_segm, u, v = siuv
sigma_1, sigma_2, kappa_u, kappa_v = None, None, None, None
sigma_1_lowres, sigma_2_lowres, kappa_u_lowres, kappa_v_lowres = None, None, None, None
fine_segm_confidence_lowres, fine_segm_confidence = None, None
coarse_segm_confidence_lowres, coarse_segm_confidence = None, None
if self.confidence_model_cfg.uv_confidence.enabled:
if self.confidence_model_cfg.uv_confidence.type == DensePoseUVConfidenceType.IID_ISO:
sigma_2_lowres = self.sigma_2_lowres(head_outputs)
# assuming base class defines interp2d method for bilinear interpolation
sigma_2 = self.interp2d(sigma_2_lowres)
elif (
self.confidence_model_cfg.uv_confidence.type
== DensePoseUVConfidenceType.INDEP_ANISO
):
sigma_2_lowres = self.sigma_2_lowres(head_outputs)
kappa_u_lowres = self.kappa_u_lowres(head_outputs)
kappa_v_lowres = self.kappa_v_lowres(head_outputs)
# assuming base class defines interp2d method for bilinear interpolation
sigma_2 = self.interp2d(sigma_2_lowres)
kappa_u = self.interp2d(kappa_u_lowres)
kappa_v = self.interp2d(kappa_v_lowres)
else:
                raise ValueError(
                    f"Unknown confidence model type: "
                    f"{self.confidence_model_cfg.uv_confidence.type}"
                )
if self.confidence_model_cfg.segm_confidence.enabled:
fine_segm_confidence_lowres = self.fine_segm_confidence_lowres(head_outputs)
# assuming base class defines interp2d method for bilinear interpolation
fine_segm_confidence = self.interp2d(fine_segm_confidence_lowres)
fine_segm_confidence = (
F.softplus(fine_segm_confidence) + self.confidence_model_cfg.segm_confidence.epsilon
)
fine_segm = fine_segm * torch.repeat_interleave(
fine_segm_confidence, fine_segm.shape[1], dim=1
)
coarse_segm_confidence_lowres = self.coarse_segm_confidence_lowres(head_outputs)
# assuming base class defines interp2d method for bilinear interpolation
coarse_segm_confidence = self.interp2d(coarse_segm_confidence_lowres)
coarse_segm_confidence = (
F.softplus(coarse_segm_confidence)
+ self.confidence_model_cfg.segm_confidence.epsilon
)
coarse_segm = coarse_segm * torch.repeat_interleave(
coarse_segm_confidence, coarse_segm.shape[1], dim=1
)
results = []
# append SIUV with possibly modified segmentation tensors
results.append((coarse_segm, fine_segm, u, v))
# append the rest of base predictor outputs
if isinstance(base_predictor_outputs, tuple):
results.extend(base_predictor_outputs[1:])
# append hi-res confidence estimates
results.append(
(sigma_1, sigma_2, kappa_u, kappa_v, fine_segm_confidence, coarse_segm_confidence)
)
# append lo-res confidence estimates
results.append(
(
sigma_1_lowres,
sigma_2_lowres,
kappa_u_lowres,
kappa_v_lowres,
fine_segm_confidence_lowres,
coarse_segm_confidence_lowres,
)
)
return tuple(results)

View File

@ -0,0 +1,13 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
from . import DensePoseChartConfidencePredictorMixin, DensePoseChartPredictor
class DensePoseChartWithConfidencePredictor(
DensePoseChartConfidencePredictorMixin, DensePoseChartPredictor
):
"""
Predictor that combines chart and chart confidence estimation
"""
pass

View File

@ -0,0 +1,263 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import fvcore.nn.weight_init as weight_init
import torch
from torch import nn
from torch.nn import functional as F
from detectron2.config import CfgNode
from detectron2.layers import Conv2d
from .registry import ROI_DENSEPOSE_HEAD_REGISTRY
@ROI_DENSEPOSE_HEAD_REGISTRY.register()
class DensePoseDeepLabHead(nn.Module):
"""
DensePose head using DeepLabV3 model from
"Rethinking Atrous Convolution for Semantic Image Segmentation"
<https://arxiv.org/abs/1706.05587>.
"""
def __init__(self, cfg: CfgNode, input_channels: int):
super(DensePoseDeepLabHead, self).__init__()
# fmt: off
hidden_dim = cfg.MODEL.ROI_DENSEPOSE_HEAD.CONV_HEAD_DIM
kernel_size = cfg.MODEL.ROI_DENSEPOSE_HEAD.CONV_HEAD_KERNEL
norm = cfg.MODEL.ROI_DENSEPOSE_HEAD.DEEPLAB.NORM
self.n_stacked_convs = cfg.MODEL.ROI_DENSEPOSE_HEAD.NUM_STACKED_CONVS
self.use_nonlocal = cfg.MODEL.ROI_DENSEPOSE_HEAD.DEEPLAB.NONLOCAL_ON
# fmt: on
pad_size = kernel_size // 2
n_channels = input_channels
self.ASPP = ASPP(input_channels, [6, 12, 56], n_channels) # 6, 12, 56
self.add_module("ASPP", self.ASPP)
if self.use_nonlocal:
self.NLBlock = NONLocalBlock2D(input_channels, bn_layer=True)
self.add_module("NLBlock", self.NLBlock)
# weight_init.c2_msra_fill(self.ASPP)
for i in range(self.n_stacked_convs):
norm_module = nn.GroupNorm(32, hidden_dim) if norm == "GN" else None
layer = Conv2d(
n_channels,
hidden_dim,
kernel_size,
stride=1,
padding=pad_size,
bias=not norm,
norm=norm_module,
)
weight_init.c2_msra_fill(layer)
n_channels = hidden_dim
layer_name = self._get_layer_name(i)
self.add_module(layer_name, layer)
self.n_out_channels = hidden_dim
# initialize_module_params(self)
def forward(self, features):
x0 = features
x = self.ASPP(x0)
if self.use_nonlocal:
x = self.NLBlock(x)
output = x
for i in range(self.n_stacked_convs):
layer_name = self._get_layer_name(i)
x = getattr(self, layer_name)(x)
x = F.relu(x)
output = x
return output
def _get_layer_name(self, i: int):
layer_name = "body_conv_fcn{}".format(i + 1)
return layer_name
# Copied from
# https://github.com/pytorch/vision/blob/master/torchvision/models/segmentation/deeplabv3.py
# See https://arxiv.org/pdf/1706.05587.pdf for details
class ASPPConv(nn.Sequential):
def __init__(self, in_channels, out_channels, dilation):
modules = [
nn.Conv2d(
in_channels, out_channels, 3, padding=dilation, dilation=dilation, bias=False
),
nn.GroupNorm(32, out_channels),
nn.ReLU(),
]
super(ASPPConv, self).__init__(*modules)
class ASPPPooling(nn.Sequential):
def __init__(self, in_channels, out_channels):
super(ASPPPooling, self).__init__(
nn.AdaptiveAvgPool2d(1),
nn.Conv2d(in_channels, out_channels, 1, bias=False),
nn.GroupNorm(32, out_channels),
nn.ReLU(),
)
def forward(self, x):
size = x.shape[-2:]
x = super(ASPPPooling, self).forward(x)
return F.interpolate(x, size=size, mode="bilinear", align_corners=False)
class ASPP(nn.Module):
def __init__(self, in_channels, atrous_rates, out_channels):
super(ASPP, self).__init__()
modules = []
modules.append(
nn.Sequential(
nn.Conv2d(in_channels, out_channels, 1, bias=False),
nn.GroupNorm(32, out_channels),
nn.ReLU(),
)
)
rate1, rate2, rate3 = tuple(atrous_rates)
modules.append(ASPPConv(in_channels, out_channels, rate1))
modules.append(ASPPConv(in_channels, out_channels, rate2))
modules.append(ASPPConv(in_channels, out_channels, rate3))
modules.append(ASPPPooling(in_channels, out_channels))
self.convs = nn.ModuleList(modules)
self.project = nn.Sequential(
nn.Conv2d(5 * out_channels, out_channels, 1, bias=False),
# nn.BatchNorm2d(out_channels),
nn.ReLU()
# nn.Dropout(0.5)
)
def forward(self, x):
res = []
for conv in self.convs:
res.append(conv(x))
res = torch.cat(res, dim=1)
return self.project(res)
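# --- Hedged usage sketch (added for illustration; not part of the original file) ---
# The ASPP module above preserves the spatial resolution: four parallel atrous
# branches plus a global-pooling branch are concatenated and projected back to
# out_channels. Shapes below are arbitrary.
def _example_aspp():
    aspp = ASPP(in_channels=64, atrous_rates=[6, 12, 56], out_channels=64)
    x = torch.randn(2, 64, 32, 32)
    return aspp(x).shape  # torch.Size([2, 64, 32, 32])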
# copied from
# https://github.com/AlexHex7/Non-local_pytorch/blob/master/lib/non_local_embedded_gaussian.py
# See https://arxiv.org/abs/1711.07971 for details
class _NonLocalBlockND(nn.Module):
def __init__(
self, in_channels, inter_channels=None, dimension=3, sub_sample=True, bn_layer=True
):
super(_NonLocalBlockND, self).__init__()
assert dimension in [1, 2, 3]
self.dimension = dimension
self.sub_sample = sub_sample
self.in_channels = in_channels
self.inter_channels = inter_channels
if self.inter_channels is None:
self.inter_channels = in_channels // 2
if self.inter_channels == 0:
self.inter_channels = 1
if dimension == 3:
conv_nd = nn.Conv3d
max_pool_layer = nn.MaxPool3d(kernel_size=(1, 2, 2))
bn = nn.GroupNorm # (32, hidden_dim) #nn.BatchNorm3d
elif dimension == 2:
conv_nd = nn.Conv2d
max_pool_layer = nn.MaxPool2d(kernel_size=(2, 2))
bn = nn.GroupNorm # (32, hidden_dim)nn.BatchNorm2d
else:
conv_nd = nn.Conv1d
max_pool_layer = nn.MaxPool1d(kernel_size=2)
bn = nn.GroupNorm # (32, hidden_dim)nn.BatchNorm1d
self.g = conv_nd(
in_channels=self.in_channels,
out_channels=self.inter_channels,
kernel_size=1,
stride=1,
padding=0,
)
if bn_layer:
self.W = nn.Sequential(
conv_nd(
in_channels=self.inter_channels,
out_channels=self.in_channels,
kernel_size=1,
stride=1,
padding=0,
),
bn(32, self.in_channels),
)
nn.init.constant_(self.W[1].weight, 0)
nn.init.constant_(self.W[1].bias, 0)
else:
self.W = conv_nd(
in_channels=self.inter_channels,
out_channels=self.in_channels,
kernel_size=1,
stride=1,
padding=0,
)
nn.init.constant_(self.W.weight, 0)
nn.init.constant_(self.W.bias, 0)
self.theta = conv_nd(
in_channels=self.in_channels,
out_channels=self.inter_channels,
kernel_size=1,
stride=1,
padding=0,
)
self.phi = conv_nd(
in_channels=self.in_channels,
out_channels=self.inter_channels,
kernel_size=1,
stride=1,
padding=0,
)
if sub_sample:
self.g = nn.Sequential(self.g, max_pool_layer)
self.phi = nn.Sequential(self.phi, max_pool_layer)
def forward(self, x):
"""
:param x: (b, c, t, h, w)
:return:
"""
batch_size = x.size(0)
g_x = self.g(x).view(batch_size, self.inter_channels, -1)
g_x = g_x.permute(0, 2, 1)
theta_x = self.theta(x).view(batch_size, self.inter_channels, -1)
theta_x = theta_x.permute(0, 2, 1)
phi_x = self.phi(x).view(batch_size, self.inter_channels, -1)
f = torch.matmul(theta_x, phi_x)
f_div_C = F.softmax(f, dim=-1)
y = torch.matmul(f_div_C, g_x)
y = y.permute(0, 2, 1).contiguous()
y = y.view(batch_size, self.inter_channels, *x.size()[2:])
W_y = self.W(y)
z = W_y + x
return z
class NONLocalBlock2D(_NonLocalBlockND):
def __init__(self, in_channels, inter_channels=None, sub_sample=True, bn_layer=True):
super(NONLocalBlock2D, self).__init__(
in_channels,
inter_channels=inter_channels,
dimension=2,
sub_sample=sub_sample,
bn_layer=bn_layer,
)
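# --- Hedged usage sketch (added for illustration; not part of the original file) ---
# The embedded-Gaussian non-local block is a residual operator, so its output has
# the same shape as its input. Channel and spatial sizes below are arbitrary.
def _example_nonlocal_block():
    block = NONLocalBlock2D(in_channels=64, bn_layer=True)
    x = torch.randn(2, 64, 28, 28)
    return block(x).shape  # torch.Size([2, 64, 28, 28])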

View File

@ -0,0 +1,5 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
from detectron2.utils.registry import Registry
ROI_DENSEPOSE_HEAD_REGISTRY = Registry("ROI_DENSEPOSE_HEAD")

View File

@ -0,0 +1,224 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import numpy as np
from typing import Dict, List, Optional
import fvcore.nn.weight_init as weight_init
import torch
import torch.nn as nn
from torch.nn import functional as F
from detectron2.layers import Conv2d, ShapeSpec, get_norm
from detectron2.modeling import ROI_HEADS_REGISTRY, StandardROIHeads
from detectron2.modeling.poolers import ROIPooler
from detectron2.modeling.roi_heads import select_foreground_proposals
from detectron2.structures import ImageList, Instances
from .. import (
build_densepose_data_filter,
build_densepose_head,
build_densepose_losses,
build_densepose_predictor,
densepose_inference,
)
class Decoder(nn.Module):
"""
A semantic segmentation head described in detail in the Panoptic Feature Pyramid Networks paper
(https://arxiv.org/abs/1901.02446). It takes FPN features as input and merges information from
    all levels of the FPN into a single output.
"""
def __init__(self, cfg, input_shape: Dict[str, ShapeSpec], in_features):
super(Decoder, self).__init__()
# fmt: off
self.in_features = in_features
feature_strides = {k: v.stride for k, v in input_shape.items()}
feature_channels = {k: v.channels for k, v in input_shape.items()}
num_classes = cfg.MODEL.ROI_DENSEPOSE_HEAD.DECODER_NUM_CLASSES
conv_dims = cfg.MODEL.ROI_DENSEPOSE_HEAD.DECODER_CONV_DIMS
self.common_stride = cfg.MODEL.ROI_DENSEPOSE_HEAD.DECODER_COMMON_STRIDE
norm = cfg.MODEL.ROI_DENSEPOSE_HEAD.DECODER_NORM
# fmt: on
self.scale_heads = []
for in_feature in self.in_features:
head_ops = []
head_length = max(
1, int(np.log2(feature_strides[in_feature]) - np.log2(self.common_stride))
)
for k in range(head_length):
conv = Conv2d(
feature_channels[in_feature] if k == 0 else conv_dims,
conv_dims,
kernel_size=3,
stride=1,
padding=1,
bias=not norm,
norm=get_norm(norm, conv_dims),
activation=F.relu,
)
weight_init.c2_msra_fill(conv)
head_ops.append(conv)
if feature_strides[in_feature] != self.common_stride:
head_ops.append(
nn.Upsample(scale_factor=2, mode="bilinear", align_corners=False)
)
self.scale_heads.append(nn.Sequential(*head_ops))
self.add_module(in_feature, self.scale_heads[-1])
self.predictor = Conv2d(conv_dims, num_classes, kernel_size=1, stride=1, padding=0)
weight_init.c2_msra_fill(self.predictor)
def forward(self, features: List[torch.Tensor]):
for i, _ in enumerate(self.in_features):
if i == 0:
x = self.scale_heads[i](features[i])
else:
x = x + self.scale_heads[i](features[i])
x = self.predictor(x)
return x
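# --- Hedged usage sketch (added for illustration; not part of the original file) ---
# Builds the decoder from a hand-rolled CfgNode carrying only the options read in
# __init__ (the values are assumptions, not necessarily the project defaults) and
# two synthetic FPN levels; the coarser level is upsampled to the common stride
# before the final per-pixel prediction.
def _example_decoder():
    from detectron2.config import CfgNode as CN
    cfg = CN()
    cfg.MODEL = CN()
    cfg.MODEL.ROI_DENSEPOSE_HEAD = CN()
    cfg.MODEL.ROI_DENSEPOSE_HEAD.DECODER_NUM_CLASSES = 256
    cfg.MODEL.ROI_DENSEPOSE_HEAD.DECODER_CONV_DIMS = 256
    cfg.MODEL.ROI_DENSEPOSE_HEAD.DECODER_COMMON_STRIDE = 4
    cfg.MODEL.ROI_DENSEPOSE_HEAD.DECODER_NORM = ""
    input_shape = {
        "p2": ShapeSpec(channels=256, stride=4),
        "p3": ShapeSpec(channels=256, stride=8),
    }
    decoder = Decoder(cfg, input_shape, in_features=["p2", "p3"])
    features = [torch.randn(1, 256, 64, 64), torch.randn(1, 256, 32, 32)]
    return decoder(features).shape  # torch.Size([1, 256, 64, 64])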
@ROI_HEADS_REGISTRY.register()
class DensePoseROIHeads(StandardROIHeads):
"""
    A Standard ROIHeads which additionally contains a DensePose head.
"""
def __init__(self, cfg, input_shape):
super().__init__(cfg, input_shape)
self._init_densepose_head(cfg, input_shape)
def _init_densepose_head(self, cfg, input_shape):
# fmt: off
self.densepose_on = cfg.MODEL.DENSEPOSE_ON
if not self.densepose_on:
return
self.densepose_data_filter = build_densepose_data_filter(cfg)
dp_pooler_resolution = cfg.MODEL.ROI_DENSEPOSE_HEAD.POOLER_RESOLUTION
dp_pooler_sampling_ratio = cfg.MODEL.ROI_DENSEPOSE_HEAD.POOLER_SAMPLING_RATIO
dp_pooler_type = cfg.MODEL.ROI_DENSEPOSE_HEAD.POOLER_TYPE
self.use_decoder = cfg.MODEL.ROI_DENSEPOSE_HEAD.DECODER_ON
# fmt: on
if self.use_decoder:
dp_pooler_scales = (1.0 / input_shape[self.in_features[0]].stride,)
else:
dp_pooler_scales = tuple(1.0 / input_shape[k].stride for k in self.in_features)
in_channels = [input_shape[f].channels for f in self.in_features][0]
if self.use_decoder:
self.decoder = Decoder(cfg, input_shape, self.in_features)
self.densepose_pooler = ROIPooler(
output_size=dp_pooler_resolution,
scales=dp_pooler_scales,
sampling_ratio=dp_pooler_sampling_ratio,
pooler_type=dp_pooler_type,
)
self.densepose_head = build_densepose_head(cfg, in_channels)
self.densepose_predictor = build_densepose_predictor(
cfg, self.densepose_head.n_out_channels
)
self.densepose_losses = build_densepose_losses(cfg)
def _forward_densepose(self, features: Dict[str, torch.Tensor], instances: List[Instances]):
"""
Forward logic of the densepose prediction branch.
Args:
features (dict[str, Tensor]): input data as a mapping from feature
map name to tensor. Axis 0 represents the number of images `N` in
the input data; axes 1-3 are channels, height, and width, which may
vary between feature maps (e.g., if a feature pyramid is used).
            instances (list[Instances]): length `N` list of `Instances`. The i-th
                `Instances` contains instances for the i-th input image.
                In training, they can be the proposals.
                In inference, they can be the predicted boxes.
Returns:
In training, a dict of losses.
In inference, update `instances` with new fields "densepose" and return it.
"""
if not self.densepose_on:
return {} if self.training else instances
features = [features[f] for f in self.in_features]
if self.training:
proposals, _ = select_foreground_proposals(instances, self.num_classes)
features, proposals = self.densepose_data_filter(features, proposals)
if len(proposals) > 0:
proposal_boxes = [x.proposal_boxes for x in proposals]
if self.use_decoder:
features = [self.decoder(features)]
features_dp = self.densepose_pooler(features, proposal_boxes)
densepose_head_outputs = self.densepose_head(features_dp)
densepose_outputs, _, confidences, _ = self.densepose_predictor(
densepose_head_outputs
)
densepose_loss_dict = self.densepose_losses(
proposals, densepose_outputs, confidences
)
return densepose_loss_dict
else:
pred_boxes = [x.pred_boxes for x in instances]
if self.use_decoder:
features = [self.decoder(features)]
features_dp = self.densepose_pooler(features, pred_boxes)
if len(features_dp) > 0:
densepose_head_outputs = self.densepose_head(features_dp)
densepose_outputs, _, confidences, _ = self.densepose_predictor(
densepose_head_outputs
)
else:
                # If no detections occurred, set the DensePose outputs
                # and confidences to empty tensors
empty_tensor = torch.zeros(size=(0, 0, 0, 0), device=features_dp.device)
densepose_outputs = tuple([empty_tensor] * 4)
confidences = tuple([empty_tensor] * 6)
densepose_inference(densepose_outputs, confidences, instances)
return instances
def forward(
self,
images: ImageList,
features: Dict[str, torch.Tensor],
proposals: List[Instances],
targets: Optional[List[Instances]] = None,
):
instances, losses = super().forward(images, features, proposals, targets)
del targets, images
if self.training:
losses.update(self._forward_densepose(features, instances))
return instances, losses
def forward_with_given_boxes(
self, features: Dict[str, torch.Tensor], instances: List[Instances]
):
"""
Use the given boxes in `instances` to produce other (non-box) per-ROI outputs.
        This is useful for downstream tasks where a box is known, but one needs to
        obtain other attributes (outputs of other heads).
Test-time augmentation also uses this.
Args:
features: same as in `forward()`
instances (list[Instances]): instances to predict other outputs. Expect the keys
"pred_boxes" and "pred_classes" to exist.
Returns:
instances (list[Instances]):
the same `Instances` objects, with extra
fields such as `pred_masks` or `pred_keypoints`.
"""
instances = super().forward_with_given_boxes(features, instances)
instances = self._forward_densepose(features, instances)
return instances
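    # -----------------------------------------------------------------------
    # Usage sketch (added for illustration; not part of the original API).
    # In practice this ROI head is built by detectron2's ROI_HEADS registry
    # from a DensePose config; the generalized R-CNN pipeline calls
    # `forward()` during training and `forward_with_given_boxes()` at
    # inference. Variable names and feature keys below are assumptions.
    #
    #   features = {"p2": f2, "p3": f3, "p4": f4, "p5": f5}  # FPN outputs
    #   # training: proposals carry matched ground truth; the returned losses
    #   # include the DensePose terms from `_forward_densepose`
    #   _, losses = roi_heads(images, features, proposals, targets)
    #   # inference: instances carry pred_boxes / pred_classes; the call
    #   # below attaches DensePose predictions to each Instances object
    #   instances = roi_heads.forward_with_given_boxes(features, instances)
    # -----------------------------------------------------------------------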

View File

@ -0,0 +1,64 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import torch
from torch import nn
from torch.nn import functional as F
from detectron2.config import CfgNode
from detectron2.layers import Conv2d
from ..utils import initialize_module_params
from .registry import ROI_DENSEPOSE_HEAD_REGISTRY
@ROI_DENSEPOSE_HEAD_REGISTRY.register()
class DensePoseV1ConvXHead(nn.Module):
"""
Fully convolutional DensePose head.
"""
def __init__(self, cfg: CfgNode, input_channels: int):
"""
Initialize DensePose fully convolutional head
Args:
cfg (CfgNode): configuration options
input_channels (int): number of input channels
"""
super(DensePoseV1ConvXHead, self).__init__()
# fmt: off
hidden_dim = cfg.MODEL.ROI_DENSEPOSE_HEAD.CONV_HEAD_DIM
kernel_size = cfg.MODEL.ROI_DENSEPOSE_HEAD.CONV_HEAD_KERNEL
self.n_stacked_convs = cfg.MODEL.ROI_DENSEPOSE_HEAD.NUM_STACKED_CONVS
# fmt: on
pad_size = kernel_size // 2
n_channels = input_channels
for i in range(self.n_stacked_convs):
layer = Conv2d(n_channels, hidden_dim, kernel_size, stride=1, padding=pad_size)
layer_name = self._get_layer_name(i)
self.add_module(layer_name, layer)
n_channels = hidden_dim
self.n_out_channels = n_channels
initialize_module_params(self)
def forward(self, features: torch.Tensor):
"""
Apply DensePose fully convolutional head to the input features
Args:
features (tensor): input features
        Returns:
A tensor of DensePose head outputs
"""
x = features
output = x
for i in range(self.n_stacked_convs):
layer_name = self._get_layer_name(i)
x = getattr(self, layer_name)(x)
x = F.relu(x)
output = x
return output
def _get_layer_name(self, i: int):
layer_name = "body_conv_fcn{}".format(i + 1)
return layer_name
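# Illustrative sketch (added for documentation; not used by the pipeline):
# runs the head on dummy pooled ROI features. The channel count (256) and
# ROI resolution (28x28) are assumptions; in the real pipeline the head is
# created via `build_densepose_head(cfg, in_channels)` and fed by the
# DensePose ROI pooler.
def _v1convx_demo() -> torch.Tensor:
    from detectron2.config import get_cfg

    from densepose import add_densepose_config

    cfg = get_cfg()
    add_densepose_config(cfg)  # populates cfg.MODEL.ROI_DENSEPOSE_HEAD.* defaults
    head = DensePoseV1ConvXHead(cfg, input_channels=256)
    rois = torch.zeros(2, 256, 28, 28)  # (num_rois, C, H, W) pooled features
    # stride-1 convolutions with same-padding keep the 28x28 spatial size;
    # the channel count becomes cfg.MODEL.ROI_DENSEPOSE_HEAD.CONV_HEAD_DIM
    return head(rois)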

View File

@ -0,0 +1,250 @@
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import argparse
import logging
import os
import sys
from timeit import default_timer as timer
from typing import Any, ClassVar, Dict, List
import torch
from fvcore.common.file_io import PathManager
from detectron2.data.catalog import DatasetCatalog
from detectron2.utils.logger import setup_logger
from densepose.data.structures import DensePoseDataRelative
from densepose.utils.dbhelper import EntrySelector
from densepose.utils.logger import verbosity_to_level
from densepose.vis.base import CompoundVisualizer
from densepose.vis.bounding_box import BoundingBoxVisualizer
from densepose.vis.densepose import (
DensePoseDataCoarseSegmentationVisualizer,
DensePoseDataPointsIVisualizer,
DensePoseDataPointsUVisualizer,
DensePoseDataPointsVisualizer,
DensePoseDataPointsVVisualizer,
)
DOC = """Query DB - a tool to print / visualize data from a database
"""
LOGGER_NAME = "query_db"
logger = logging.getLogger(LOGGER_NAME)
_ACTION_REGISTRY: Dict[str, "Action"] = {}
class Action(object):
@classmethod
def add_arguments(cls: type, parser: argparse.ArgumentParser):
parser.add_argument(
"-v",
"--verbosity",
action="count",
help="Verbose mode. Multiple -v options increase the verbosity.",
)
def register_action(cls: type):
"""
Decorator for action classes to automate action registration
"""
global _ACTION_REGISTRY
_ACTION_REGISTRY[cls.COMMAND] = cls
return cls
class EntrywiseAction(Action):
@classmethod
def add_arguments(cls: type, parser: argparse.ArgumentParser):
super(EntrywiseAction, cls).add_arguments(parser)
parser.add_argument(
"dataset", metavar="<dataset>", help="Dataset name (e.g. densepose_coco_2014_train)"
)
parser.add_argument(
"selector",
metavar="<selector>",
help="Dataset entry selector in the form field1[:type]=value1[,"
"field2[:type]=value_min-value_max...] which selects all "
"entries from the dataset that satisfy the constraints",
)
parser.add_argument(
"--max-entries", metavar="N", help="Maximum number of entries to process", type=int
)
@classmethod
def execute(cls: type, args: argparse.Namespace):
dataset = setup_dataset(args.dataset)
entry_selector = EntrySelector.from_string(args.selector)
context = cls.create_context(args)
if args.max_entries is not None:
for _, entry in zip(range(args.max_entries), dataset):
if entry_selector(entry):
cls.execute_on_entry(entry, context)
else:
for entry in dataset:
if entry_selector(entry):
cls.execute_on_entry(entry, context)
@classmethod
def create_context(cls: type, args: argparse.Namespace) -> Dict[str, Any]:
context = {}
return context
@register_action
class PrintAction(EntrywiseAction):
"""
Print action that outputs selected entries to stdout
"""
COMMAND: ClassVar[str] = "print"
@classmethod
def add_parser(cls: type, subparsers: argparse._SubParsersAction):
parser = subparsers.add_parser(cls.COMMAND, help="Output selected entries to stdout. ")
cls.add_arguments(parser)
parser.set_defaults(func=cls.execute)
@classmethod
def add_arguments(cls: type, parser: argparse.ArgumentParser):
super(PrintAction, cls).add_arguments(parser)
@classmethod
def execute_on_entry(cls: type, entry: Dict[str, Any], context: Dict[str, Any]):
import pprint
printer = pprint.PrettyPrinter(indent=2, width=200, compact=True)
printer.pprint(entry)
@register_action
class ShowAction(EntrywiseAction):
"""
Show action that visualizes selected entries on an image
"""
COMMAND: ClassVar[str] = "show"
VISUALIZERS: ClassVar[Dict[str, object]] = {
"dp_segm": DensePoseDataCoarseSegmentationVisualizer(),
"dp_i": DensePoseDataPointsIVisualizer(),
"dp_u": DensePoseDataPointsUVisualizer(),
"dp_v": DensePoseDataPointsVVisualizer(),
"dp_pts": DensePoseDataPointsVisualizer(),
"bbox": BoundingBoxVisualizer(),
}
@classmethod
def add_parser(cls: type, subparsers: argparse._SubParsersAction):
parser = subparsers.add_parser(cls.COMMAND, help="Visualize selected entries")
cls.add_arguments(parser)
parser.set_defaults(func=cls.execute)
@classmethod
def add_arguments(cls: type, parser: argparse.ArgumentParser):
super(ShowAction, cls).add_arguments(parser)
parser.add_argument(
"visualizations",
metavar="<visualizations>",
help="Comma separated list of visualizations, possible values: "
"[{}]".format(",".join(sorted(cls.VISUALIZERS.keys()))),
)
parser.add_argument(
"--output",
metavar="<image_file>",
default="output.png",
help="File name to save output to",
)
@classmethod
def execute_on_entry(cls: type, entry: Dict[str, Any], context: Dict[str, Any]):
import cv2
import numpy as np
image_fpath = PathManager.get_local_path(entry["file_name"])
image = cv2.imread(image_fpath, cv2.IMREAD_GRAYSCALE)
image = np.tile(image[:, :, np.newaxis], [1, 1, 3])
datas = cls._extract_data_for_visualizers_from_entry(context["vis_specs"], entry)
visualizer = context["visualizer"]
image_vis = visualizer.visualize(image, datas)
entry_idx = context["entry_idx"] + 1
out_fname = cls._get_out_fname(entry_idx, context["out_fname"])
cv2.imwrite(out_fname, image_vis)
logger.info(f"Output saved to {out_fname}")
context["entry_idx"] += 1
@classmethod
def _get_out_fname(cls: type, entry_idx: int, fname_base: str):
base, ext = os.path.splitext(fname_base)
return base + ".{0:04d}".format(entry_idx) + ext
@classmethod
def create_context(cls: type, args: argparse.Namespace) -> Dict[str, Any]:
vis_specs = args.visualizations.split(",")
visualizers = []
for vis_spec in vis_specs:
vis = cls.VISUALIZERS[vis_spec]
visualizers.append(vis)
context = {
"vis_specs": vis_specs,
"visualizer": CompoundVisualizer(visualizers),
"out_fname": args.output,
"entry_idx": 0,
}
return context
@classmethod
def _extract_data_for_visualizers_from_entry(
cls: type, vis_specs: List[str], entry: Dict[str, Any]
):
dp_list = []
bbox_list = []
for annotation in entry["annotations"]:
is_valid, _ = DensePoseDataRelative.validate_annotation(annotation)
if not is_valid:
continue
bbox = torch.as_tensor(annotation["bbox"])
bbox_list.append(bbox)
dp_data = DensePoseDataRelative(annotation)
dp_list.append(dp_data)
datas = []
for vis_spec in vis_specs:
datas.append(bbox_list if "bbox" == vis_spec else (bbox_list, dp_list))
return datas
def setup_dataset(dataset_name):
logger.info("Loading dataset {}".format(dataset_name))
start = timer()
dataset = DatasetCatalog.get(dataset_name)
stop = timer()
logger.info("Loaded dataset {} in {:.3f}s".format(dataset_name, stop - start))
return dataset
def create_argument_parser() -> argparse.ArgumentParser:
parser = argparse.ArgumentParser(
description=DOC,
formatter_class=lambda prog: argparse.HelpFormatter(prog, max_help_position=120),
)
parser.set_defaults(func=lambda _: parser.print_help(sys.stdout))
subparsers = parser.add_subparsers(title="Actions")
for _, action in _ACTION_REGISTRY.items():
action.add_parser(subparsers)
return parser
def main():
parser = create_argument_parser()
args = parser.parse_args()
verbosity = args.verbosity if hasattr(args, "verbosity") else None
global logger
logger = setup_logger(name=LOGGER_NAME)
logger.setLevel(verbosity_to_level(verbosity))
args.func(args)
if __name__ == "__main__":
main()
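# ---------------------------------------------------------------------------
# Example invocations (illustrative; dataset name, selector and output path
# are assumptions to be adapted to the local setup):
#
#   # print up to two entries matching a given image id
#   python query_db.py print densepose_coco_2014_minival image_id:int=508 --max-entries 2 -v
#
#   # visualize coarse segmentation and bounding boxes for matching entries
#   python query_db.py show densepose_coco_2014_minival image_id:int=508 dp_segm,bbox --output entry.png
# ---------------------------------------------------------------------------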

View File

@ -0,0 +1,74 @@
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
"""
DensePose Training Script.
This script is similar to the training script in detectron2/tools.
It is an example of how a user might use detectron2 for a new project.
"""
from fvcore.common.file_io import PathManager
import detectron2.utils.comm as comm
from detectron2.config import get_cfg
from detectron2.engine import default_argument_parser, default_setup, hooks, launch
from detectron2.evaluation import verify_results
from detectron2.utils.logger import setup_logger
from densepose import add_densepose_config
from densepose.engine import Trainer
from densepose.modeling.densepose_checkpoint import DensePoseCheckpointer
def setup(args):
cfg = get_cfg()
add_densepose_config(cfg)
cfg.merge_from_file(args.config_file)
cfg.merge_from_list(args.opts)
cfg.freeze()
default_setup(cfg, args)
# Setup logger for "densepose" module
setup_logger(output=cfg.OUTPUT_DIR, distributed_rank=comm.get_rank(), name="densepose")
return cfg
def main(args):
cfg = setup(args)
# disable strict kwargs checking: allow one to specify path handle
# hints through kwargs, like timeout in DP evaluation
PathManager.set_strict_kwargs_checking(False)
if args.eval_only:
model = Trainer.build_model(cfg)
DensePoseCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load(
cfg.MODEL.WEIGHTS, resume=args.resume
)
res = Trainer.test(cfg, model)
if cfg.TEST.AUG.ENABLED:
res.update(Trainer.test_with_TTA(cfg, model))
if comm.is_main_process():
verify_results(cfg, res)
return res
trainer = Trainer(cfg)
trainer.resume_or_load(resume=args.resume)
if cfg.TEST.AUG.ENABLED:
trainer.register_hooks(
[hooks.EvalHook(0, lambda: trainer.test_with_TTA(cfg, trainer.model))]
)
return trainer.train()
if __name__ == "__main__":
args = default_argument_parser().parse_args()
print("Command Line Args:", args)
launch(
main,
args.num_gpus,
num_machines=args.num_machines,
machine_rank=args.machine_rank,
dist_url=args.dist_url,
args=(args,),
)
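# ---------------------------------------------------------------------------
# Example invocations (illustrative; the config path, GPU count and weights
# path are assumptions to be adapted to the local setup):
#
#   # distributed training on a single machine
#   python train_net.py --config-file configs/densepose_rcnn_R_50_FPN_s1x.yaml --num-gpus 8
#
#   # evaluation only, reusing previously trained weights
#   python train_net.py --config-file configs/densepose_rcnn_R_50_FPN_s1x.yaml \
#       --eval-only MODEL.WEIGHTS /path/to/model_final.pth
# ---------------------------------------------------------------------------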