Add at new repo again

2025-01-28 21:48:35 +00:00
commit 6e660ddb3c
564 changed files with 75575 additions and 0 deletions


@@ -0,0 +1,54 @@
# DensePose in Detectron2
**Dense Human Pose Estimation In The Wild**
_Rıza Alp Güler, Natalia Neverova, Iasonas Kokkinos_
[[`densepose.org`](https://densepose.org)] [[`arXiv`](https://arxiv.org/abs/1802.00434)] [[`BibTeX`](#CitingDensePose)]
Dense human pose estimation aims at mapping all human pixels of an RGB image to the 3D surface of the human body.
<div align="center">
<img src="https://drive.google.com/uc?export=view&id=1qfSOkpueo1kVZbXOuQJJhyagKjMgepsz" width="700px" />
</div>
In this repository, we provide the code to train and evaluate DensePose-RCNN. We also provide tools to visualize
DensePose annotation and results.
## Quick Start
See [Getting Started](doc/GETTING_STARTED.md).
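For programmatic use, a minimal inference sketch mirroring `apply_net.py` is shown below; the config path, checkpoint file and image name are placeholders:
```
# Minimal sketch of running a trained DensePose model; paths below are placeholders.
from detectron2.config import get_cfg
from detectron2.data.detection_utils import read_image
from detectron2.engine.defaults import DefaultPredictor

from densepose import add_densepose_config

cfg = get_cfg()
add_densepose_config(cfg)
cfg.merge_from_file("configs/densepose_rcnn_R_50_FPN_s1x.yaml")  # placeholder config
cfg.MODEL.WEIGHTS = "model_final.pkl"                            # placeholder checkpoint
cfg.freeze()

predictor = DefaultPredictor(cfg)
img = read_image("image.jpg", format="BGR")  # the predictor expects a BGR image
instances = predictor(img)["instances"]
print(instances.get("scores"), instances.get("pred_boxes"))
```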
## Model Zoo and Baselines
We provide a number of baseline results and trained models available for download. See [Model Zoo](doc/MODEL_ZOO.md) for details.
## License
Detectron2 is released under the [Apache 2.0 license](../../LICENSE).
## <a name="CitingDensePose"></a>Citing DensePose
If you use DensePose, please cite it using the following BibTeX entries:
For DensePose with estimated confidences:
```
@InProceedings{Neverova2019DensePoseConfidences,
title = {Correlated Uncertainty for Learning Dense Correspondences from Noisy Labels},
author = {Neverova, Natalia and Novotny, David and Vedaldi, Andrea},
    booktitle = {Advances in Neural Information Processing Systems},
year = {2019},
}
```
For the original DensePose:
```
@InProceedings{Guler2018DensePose,
title={DensePose: Dense Human Pose Estimation In The Wild},
  author={R{\i}za Alp G\"uler and Natalia Neverova and Iasonas Kokkinos},
  booktitle={The IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
year={2018}
}
```


@@ -0,0 +1,318 @@
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import argparse
import glob
import logging
import os
import pickle
import sys
from typing import Any, ClassVar, Dict, List
import torch
from detectron2.config import get_cfg
from detectron2.data.detection_utils import read_image
from detectron2.engine.defaults import DefaultPredictor
from detectron2.structures.boxes import BoxMode
from detectron2.structures.instances import Instances
from detectron2.utils.logger import setup_logger
from densepose import add_densepose_config
from densepose.utils.logger import verbosity_to_level
from densepose.vis.base import CompoundVisualizer
from densepose.vis.bounding_box import ScoredBoundingBoxVisualizer
from densepose.vis.densepose import (
DensePoseResultsContourVisualizer,
DensePoseResultsFineSegmentationVisualizer,
DensePoseResultsUVisualizer,
DensePoseResultsVVisualizer,
)
from densepose.vis.extractor import CompoundExtractor, create_extractor
DOC = """Apply Net - a tool to print / visualize DensePose results
"""
LOGGER_NAME = "apply_net"
logger = logging.getLogger(LOGGER_NAME)
_ACTION_REGISTRY: Dict[str, "Action"] = {}
class Action(object):
@classmethod
def add_arguments(cls: type, parser: argparse.ArgumentParser):
parser.add_argument(
"-v",
"--verbosity",
action="count",
help="Verbose mode. Multiple -v options increase the verbosity.",
)
def register_action(cls: type):
"""
Decorator for action classes to automate action registration
"""
global _ACTION_REGISTRY
_ACTION_REGISTRY[cls.COMMAND] = cls
return cls
class InferenceAction(Action):
@classmethod
def add_arguments(cls: type, parser: argparse.ArgumentParser):
super(InferenceAction, cls).add_arguments(parser)
parser.add_argument("cfg", metavar="<config>", help="Config file")
parser.add_argument("model", metavar="<model>", help="Model file")
parser.add_argument("input", metavar="<input>", help="Input data")
parser.add_argument(
"--opts",
help="Modify config options using the command-line 'KEY VALUE' pairs",
default=[],
nargs=argparse.REMAINDER,
)
@classmethod
def execute(cls: type, args: argparse.Namespace):
logger.info(f"Loading config from {args.cfg}")
opts = []
cfg = cls.setup_config(args.cfg, args.model, args, opts)
logger.info(f"Loading model from {args.model}")
predictor = DefaultPredictor(cfg)
logger.info(f"Loading data from {args.input}")
file_list = cls._get_input_file_list(args.input)
if len(file_list) == 0:
logger.warning(f"No input images for {args.input}")
return
context = cls.create_context(args)
for file_name in file_list:
img = read_image(file_name, format="BGR") # predictor expects BGR image.
with torch.no_grad():
outputs = predictor(img)["instances"]
cls.execute_on_outputs(context, {"file_name": file_name, "image": img}, outputs)
cls.postexecute(context)
@classmethod
def setup_config(
cls: type, config_fpath: str, model_fpath: str, args: argparse.Namespace, opts: List[str]
):
cfg = get_cfg()
add_densepose_config(cfg)
cfg.merge_from_file(config_fpath)
cfg.merge_from_list(args.opts)
if opts:
cfg.merge_from_list(opts)
cfg.MODEL.WEIGHTS = model_fpath
cfg.freeze()
return cfg
@classmethod
def _get_input_file_list(cls: type, input_spec: str):
if os.path.isdir(input_spec):
file_list = [
os.path.join(input_spec, fname)
for fname in os.listdir(input_spec)
if os.path.isfile(os.path.join(input_spec, fname))
]
elif os.path.isfile(input_spec):
file_list = [input_spec]
else:
file_list = glob.glob(input_spec)
return file_list
@register_action
class DumpAction(InferenceAction):
"""
Dump action that outputs results to a pickle file
"""
COMMAND: ClassVar[str] = "dump"
@classmethod
def add_parser(cls: type, subparsers: argparse._SubParsersAction):
parser = subparsers.add_parser(cls.COMMAND, help="Dump model outputs to a file.")
cls.add_arguments(parser)
parser.set_defaults(func=cls.execute)
@classmethod
def add_arguments(cls: type, parser: argparse.ArgumentParser):
super(DumpAction, cls).add_arguments(parser)
parser.add_argument(
"--output",
metavar="<dump_file>",
default="results.pkl",
help="File name to save dump to",
)
@classmethod
def execute_on_outputs(
cls: type, context: Dict[str, Any], entry: Dict[str, Any], outputs: Instances
):
image_fpath = entry["file_name"]
logger.info(f"Processing {image_fpath}")
result = {"file_name": image_fpath}
if outputs.has("scores"):
result["scores"] = outputs.get("scores").cpu()
if outputs.has("pred_boxes"):
result["pred_boxes_XYXY"] = outputs.get("pred_boxes").tensor.cpu()
if outputs.has("pred_densepose"):
boxes_XYWH = BoxMode.convert(
result["pred_boxes_XYXY"], BoxMode.XYXY_ABS, BoxMode.XYWH_ABS
)
result["pred_densepose"] = outputs.get("pred_densepose").to_result(boxes_XYWH)
context["results"].append(result)
@classmethod
def create_context(cls: type, args: argparse.Namespace):
context = {"results": [], "out_fname": args.output}
return context
@classmethod
def postexecute(cls: type, context: Dict[str, Any]):
out_fname = context["out_fname"]
out_dir = os.path.dirname(out_fname)
if len(out_dir) > 0 and not os.path.exists(out_dir):
os.makedirs(out_dir)
with open(out_fname, "wb") as hFile:
pickle.dump(context["results"], hFile)
logger.info(f"Output saved to {out_fname}")
@register_action
class ShowAction(InferenceAction):
"""
Show action that visualizes selected entries on an image
"""
COMMAND: ClassVar[str] = "show"
VISUALIZERS: ClassVar[Dict[str, object]] = {
"dp_contour": DensePoseResultsContourVisualizer,
"dp_segm": DensePoseResultsFineSegmentationVisualizer,
"dp_u": DensePoseResultsUVisualizer,
"dp_v": DensePoseResultsVVisualizer,
"bbox": ScoredBoundingBoxVisualizer,
}
@classmethod
def add_parser(cls: type, subparsers: argparse._SubParsersAction):
parser = subparsers.add_parser(cls.COMMAND, help="Visualize selected entries")
cls.add_arguments(parser)
parser.set_defaults(func=cls.execute)
@classmethod
def add_arguments(cls: type, parser: argparse.ArgumentParser):
super(ShowAction, cls).add_arguments(parser)
parser.add_argument(
"visualizations",
metavar="<visualizations>",
help="Comma separated list of visualizations, possible values: "
"[{}]".format(",".join(sorted(cls.VISUALIZERS.keys()))),
)
parser.add_argument(
"--min_score",
metavar="<score>",
default=0.8,
type=float,
help="Minimum detection score to visualize",
)
parser.add_argument(
"--nms_thresh", metavar="<threshold>", default=None, type=float, help="NMS threshold"
)
parser.add_argument(
"--output",
metavar="<image_file>",
default="outputres.png",
help="File name to save output to",
)
@classmethod
def setup_config(
cls: type, config_fpath: str, model_fpath: str, args: argparse.Namespace, opts: List[str]
):
opts.append("MODEL.ROI_HEADS.SCORE_THRESH_TEST")
opts.append(str(args.min_score))
if args.nms_thresh is not None:
opts.append("MODEL.ROI_HEADS.NMS_THRESH_TEST")
opts.append(str(args.nms_thresh))
cfg = super(ShowAction, cls).setup_config(config_fpath, model_fpath, args, opts)
return cfg
@classmethod
def execute_on_outputs(
cls: type, context: Dict[str, Any], entry: Dict[str, Any], outputs: Instances
):
import cv2
import numpy as np
visualizer = context["visualizer"]
extractor = context["extractor"]
image_fpath = entry["file_name"]
logger.info(f"Processing {image_fpath}")
image = cv2.cvtColor(entry["image"], cv2.COLOR_BGR2GRAY)
image = np.tile(image[:, :, np.newaxis], [1, 1, 3])
data = extractor(outputs)
image_vis = visualizer.visualize(image, data)
entry_idx = context["entry_idx"] + 1
out_fname = cls._get_out_fname(entry_idx, context["out_fname"])
out_dir = os.path.dirname(out_fname)
if len(out_dir) > 0 and not os.path.exists(out_dir):
os.makedirs(out_dir)
cv2.imwrite(out_fname, image_vis)
logger.info(f"Output saved to {out_fname}")
context["entry_idx"] += 1
@classmethod
def postexecute(cls: type, context: Dict[str, Any]):
pass
@classmethod
def _get_out_fname(cls: type, entry_idx: int, fname_base: str):
base, ext = os.path.splitext(fname_base)
return base + ".{0:04d}".format(entry_idx) + ext
@classmethod
def create_context(cls: type, args: argparse.Namespace) -> Dict[str, Any]:
vis_specs = args.visualizations.split(",")
visualizers = []
extractors = []
for vis_spec in vis_specs:
vis = cls.VISUALIZERS[vis_spec]()
visualizers.append(vis)
extractor = create_extractor(vis)
extractors.append(extractor)
visualizer = CompoundVisualizer(visualizers)
extractor = CompoundExtractor(extractors)
context = {
"extractor": extractor,
"visualizer": visualizer,
"out_fname": args.output,
"entry_idx": 0,
}
return context
def create_argument_parser() -> argparse.ArgumentParser:
parser = argparse.ArgumentParser(
description=DOC,
formatter_class=lambda prog: argparse.HelpFormatter(prog, max_help_position=120),
)
parser.set_defaults(func=lambda _: parser.print_help(sys.stdout))
subparsers = parser.add_subparsers(title="Actions")
for _, action in _ACTION_REGISTRY.items():
action.add_parser(subparsers)
return parser
def main():
parser = create_argument_parser()
args = parser.parse_args()
verbosity = args.verbosity if hasattr(args, "verbosity") else None
global logger
logger = setup_logger(name=LOGGER_NAME)
logger.setLevel(verbosity_to_level(verbosity))
args.func(args)
if __name__ == "__main__":
main()
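The `dump` action above writes a list of per-image dictionaries to a pickle file; a short sketch of reading it back, assuming the default `results.pkl` output name:
```
# Sketch: inspect the pickle written by the `dump` action (default: results.pkl).
# Note: the densepose package must be importable to unpickle the pred_densepose entries.
import pickle

with open("results.pkl", "rb") as hFile:
    results = pickle.load(hFile)

for result in results:
    # Keys set in DumpAction.execute_on_outputs when the model provides them.
    print(result["file_name"])
    if "scores" in result:
        print("  scores:", result["scores"].tolist())
    if "pred_boxes_XYXY" in result:
        print("  boxes XYXY:", tuple(result["pred_boxes_XYXY"].shape))
```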


@@ -0,0 +1,47 @@
MODEL:
META_ARCHITECTURE: "GeneralizedRCNN"
BACKBONE:
NAME: "build_resnet_fpn_backbone"
RESNETS:
OUT_FEATURES: ["res2", "res3", "res4", "res5"]
FPN:
IN_FEATURES: ["res2", "res3", "res4", "res5"]
ANCHOR_GENERATOR:
SIZES: [[32], [64], [128], [256], [512]] # One size for each in feature map
ASPECT_RATIOS: [[0.5, 1.0, 2.0]] # Three aspect ratios (same for all in feature maps)
RPN:
IN_FEATURES: ["p2", "p3", "p4", "p5", "p6"]
PRE_NMS_TOPK_TRAIN: 2000 # Per FPN level
PRE_NMS_TOPK_TEST: 1000 # Per FPN level
# Detectron1 uses 2000 proposals per-batch,
# (See "modeling/rpn/rpn_outputs.py" for details of this legacy issue)
# which is approximately 1000 proposals per-image since the default batch size for FPN is 2.
POST_NMS_TOPK_TRAIN: 1000
POST_NMS_TOPK_TEST: 1000
DENSEPOSE_ON: True
ROI_HEADS:
NAME: "DensePoseROIHeads"
IN_FEATURES: ["p2", "p3", "p4", "p5"]
NUM_CLASSES: 1
ROI_BOX_HEAD:
NAME: "FastRCNNConvFCHead"
NUM_FC: 2
POOLER_RESOLUTION: 7
POOLER_SAMPLING_RATIO: 2
POOLER_TYPE: "ROIAlign"
ROI_DENSEPOSE_HEAD:
NAME: "DensePoseV1ConvXHead"
POOLER_TYPE: "ROIAlign"
NUM_COARSE_SEGM_CHANNELS: 2
DATASETS:
TRAIN: ("densepose_coco_2014_train", "densepose_coco_2014_valminusminival")
TEST: ("densepose_coco_2014_minival",)
SOLVER:
IMS_PER_BATCH: 16
BASE_LR: 0.01
STEPS: (60000, 80000)
MAX_ITER: 90000
WARMUP_FACTOR: 0.1
INPUT:
MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)


@@ -0,0 +1,16 @@
_BASE_: "Base-DensePose-RCNN-FPN.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
RESNETS:
DEPTH: 101
ROI_DENSEPOSE_HEAD:
NAME: "DensePoseDeepLabHead"
UV_CONFIDENCE:
ENABLED: True
TYPE: "iid_iso"
POINT_REGRESSION_WEIGHTS: 0.0005
SOLVER:
CLIP_GRADIENTS:
ENABLED: True
MAX_ITER: 130000
STEPS: (100000, 120000)


@@ -0,0 +1,16 @@
_BASE_: "Base-DensePose-RCNN-FPN.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
RESNETS:
DEPTH: 101
ROI_DENSEPOSE_HEAD:
NAME: "DensePoseDeepLabHead"
UV_CONFIDENCE:
ENABLED: True
TYPE: "indep_aniso"
POINT_REGRESSION_WEIGHTS: 0.0005
SOLVER:
CLIP_GRADIENTS:
ENABLED: True
MAX_ITER: 130000
STEPS: (100000, 120000)


@@ -0,0 +1,10 @@
_BASE_: "Base-DensePose-RCNN-FPN.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
RESNETS:
DEPTH: 101
ROI_DENSEPOSE_HEAD:
NAME: "DensePoseDeepLabHead"
SOLVER:
MAX_ITER: 130000
STEPS: (100000, 120000)


@@ -0,0 +1,16 @@
_BASE_: "Base-DensePose-RCNN-FPN.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
RESNETS:
DEPTH: 101
ROI_DENSEPOSE_HEAD:
UV_CONFIDENCE:
ENABLED: True
TYPE: "iid_iso"
POINT_REGRESSION_WEIGHTS: 0.0005
SOLVER:
CLIP_GRADIENTS:
ENABLED: True
MAX_ITER: 130000
STEPS: (100000, 120000)
WARMUP_FACTOR: 0.025


@@ -0,0 +1,16 @@
_BASE_: "Base-DensePose-RCNN-FPN.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
RESNETS:
DEPTH: 101
ROI_DENSEPOSE_HEAD:
UV_CONFIDENCE:
ENABLED: True
TYPE: "indep_aniso"
POINT_REGRESSION_WEIGHTS: 0.0005
SOLVER:
CLIP_GRADIENTS:
ENABLED: True
MAX_ITER: 130000
STEPS: (100000, 120000)
WARMUP_FACTOR: 0.025


@@ -0,0 +1,8 @@
_BASE_: "Base-DensePose-RCNN-FPN.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
RESNETS:
DEPTH: 101
SOLVER:
MAX_ITER: 130000
STEPS: (100000, 120000)


@@ -0,0 +1,17 @@
_BASE_: "Base-DensePose-RCNN-FPN.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
RESNETS:
DEPTH: 101
ROI_DENSEPOSE_HEAD:
NUM_COARSE_SEGM_CHANNELS: 15
POOLER_RESOLUTION: 14
HEATMAP_SIZE: 56
INDEX_WEIGHTS: 2.0
PART_WEIGHTS: 0.3
POINT_REGRESSION_WEIGHTS: 0.1
DECODER_ON: False
SOLVER:
BASE_LR: 0.002
MAX_ITER: 130000
STEPS: (100000, 120000)


@@ -0,0 +1,16 @@
_BASE_: "Base-DensePose-RCNN-FPN.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
RESNETS:
DEPTH: 50
ROI_DENSEPOSE_HEAD:
NAME: "DensePoseDeepLabHead"
UV_CONFIDENCE:
ENABLED: True
TYPE: "iid_iso"
POINT_REGRESSION_WEIGHTS: 0.0005
SOLVER:
CLIP_GRADIENTS:
ENABLED: True
MAX_ITER: 130000
STEPS: (100000, 120000)


@@ -0,0 +1,16 @@
_BASE_: "Base-DensePose-RCNN-FPN.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
RESNETS:
DEPTH: 50
ROI_DENSEPOSE_HEAD:
NAME: "DensePoseDeepLabHead"
UV_CONFIDENCE:
ENABLED: True
TYPE: "indep_aniso"
POINT_REGRESSION_WEIGHTS: 0.0005
SOLVER:
CLIP_GRADIENTS:
ENABLED: True
MAX_ITER: 130000
STEPS: (100000, 120000)


@@ -0,0 +1,10 @@
_BASE_: "Base-DensePose-RCNN-FPN.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
RESNETS:
DEPTH: 50
ROI_DENSEPOSE_HEAD:
NAME: "DensePoseDeepLabHead"
SOLVER:
MAX_ITER: 130000
STEPS: (100000, 120000)


@@ -0,0 +1,16 @@
_BASE_: "Base-DensePose-RCNN-FPN.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
RESNETS:
DEPTH: 50
ROI_DENSEPOSE_HEAD:
UV_CONFIDENCE:
ENABLED: True
TYPE: "iid_iso"
POINT_REGRESSION_WEIGHTS: 0.0005
SOLVER:
CLIP_GRADIENTS:
ENABLED: True
MAX_ITER: 130000
STEPS: (100000, 120000)
WARMUP_FACTOR: 0.025


@@ -0,0 +1,16 @@
_BASE_: "Base-DensePose-RCNN-FPN.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
RESNETS:
DEPTH: 50
ROI_DENSEPOSE_HEAD:
UV_CONFIDENCE:
ENABLED: True
TYPE: "indep_aniso"
POINT_REGRESSION_WEIGHTS: 0.0005
SOLVER:
CLIP_GRADIENTS:
ENABLED: True
MAX_ITER: 130000
STEPS: (100000, 120000)
WARMUP_FACTOR: 0.025


@@ -0,0 +1,8 @@
_BASE_: "Base-DensePose-RCNN-FPN.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
RESNETS:
DEPTH: 50
SOLVER:
MAX_ITER: 130000
STEPS: (100000, 120000)


@@ -0,0 +1,17 @@
_BASE_: "Base-DensePose-RCNN-FPN.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
RESNETS:
DEPTH: 50
ROI_DENSEPOSE_HEAD:
NUM_COARSE_SEGM_CHANNELS: 15
POOLER_RESOLUTION: 14
HEATMAP_SIZE: 56
INDEX_WEIGHTS: 2.0
PART_WEIGHTS: 0.3
POINT_REGRESSION_WEIGHTS: 0.1
DECODER_ON: False
SOLVER:
BASE_LR: 0.002
MAX_ITER: 130000
STEPS: (100000, 120000)


@@ -0,0 +1,91 @@
MODEL:
META_ARCHITECTURE: "GeneralizedRCNN"
BACKBONE:
NAME: "build_resnet_fpn_backbone"
RESNETS:
OUT_FEATURES: ["res2", "res3", "res4", "res5"]
FPN:
IN_FEATURES: ["res2", "res3", "res4", "res5"]
ANCHOR_GENERATOR:
SIZES: [[32], [64], [128], [256], [512]] # One size for each in feature map
ASPECT_RATIOS: [[0.5, 1.0, 2.0]] # Three aspect ratios (same for all in feature maps)
RPN:
IN_FEATURES: ["p2", "p3", "p4", "p5", "p6"]
PRE_NMS_TOPK_TRAIN: 2000 # Per FPN level
PRE_NMS_TOPK_TEST: 1000 # Per FPN level
# Detectron1 uses 2000 proposals per-batch,
# (See "modeling/rpn/rpn_outputs.py" for details of this legacy issue)
# which is approximately 1000 proposals per-image since the default batch size for FPN is 2.
POST_NMS_TOPK_TRAIN: 1000
POST_NMS_TOPK_TEST: 1000
ROI_HEADS:
NAME: "StandardROIHeads"
IN_FEATURES: ["p2", "p3", "p4", "p5"]
NUM_CLASSES: 1
ROI_BOX_HEAD:
NAME: "FastRCNNConvFCHead"
NUM_FC: 2
POOLER_RESOLUTION: 7
ROI_MASK_HEAD:
NAME: "MaskRCNNConvUpsampleHead"
NUM_CONV: 4
POOLER_RESOLUTION: 14
DATASETS:
TRAIN: ("base_coco_2017_train",)
TEST: ("base_coco_2017_val", "densepose_chimps")
  CATEGORY_MAPS:
    "base_coco_2017_train":
      "16": 1 # bird -> person
      "17": 1 # cat -> person
      "18": 1 # dog -> person
      "19": 1 # horse -> person
      "20": 1 # sheep -> person
      "21": 1 # cow -> person
      "22": 1 # elephant -> person
      "23": 1 # bear -> person
      "24": 1 # zebra -> person
      "25": 1 # giraffe -> person
    "base_coco_2017_val":
      "16": 1 # bird -> person
      "17": 1 # cat -> person
      "18": 1 # dog -> person
      "19": 1 # horse -> person
      "20": 1 # sheep -> person
      "21": 1 # cow -> person
      "22": 1 # elephant -> person
      "23": 1 # bear -> person
      "24": 1 # zebra -> person
      "25": 1 # giraffe -> person
WHITELISTED_CATEGORIES:
"base_coco_2017_train":
- 1 # person
- 16 # bird
- 17 # cat
- 18 # dog
- 19 # horse
- 20 # sheep
- 21 # cow
- 22 # elephant
- 23 # bear
- 24 # zebra
      - 25 # giraffe
"base_coco_2017_val":
- 1 # person
- 16 # bird
- 17 # cat
- 18 # dog
- 19 # horse
- 20 # sheep
- 21 # cow
- 22 # elephant
- 23 # bear
- 24 # zebra
      - 25 # giraffe
SOLVER:
IMS_PER_BATCH: 16
BASE_LR: 0.02
STEPS: (60000, 80000)
MAX_ITER: 90000
INPUT:
MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
VERSION: 2


@@ -0,0 +1,7 @@
_BASE_: "Base-RCNN-FPN-MC.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
MASK_ON: False
DENSEPOSE_ON: False
RESNETS:
DEPTH: 50


@@ -0,0 +1,11 @@
_BASE_: "../Base-DensePose-RCNN-FPN.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
ROI_DENSEPOSE_HEAD:
NAME: "DensePoseDeepLabHead"
DATASETS:
TRAIN: ("densepose_coco_2014_minival_100",)
TEST: ("densepose_coco_2014_minival_100",)
SOLVER:
MAX_ITER: 40
STEPS: (30,)


@@ -0,0 +1,13 @@
_BASE_: "../densepose_rcnn_R_50_FPN_s1x.yaml"
MODEL:
WEIGHTS: "https://dl.fbaipublicfiles.com/densepose/densepose_rcnn_R_50_FPN_s1x/165712039/model_final_162be9.pkl"
DATASETS:
TRAIN: ()
TEST: ("densepose_coco_2014_minival_100",)
TEST:
AUG:
ENABLED: True
MIN_SIZES: (400, 500, 600, 700, 800, 900, 1000, 1100, 1200)
MAX_SIZE: 4000
FLIP: True
EXPECTED_RESULTS: [["bbox_TTA", "AP", 61.74, 0.03], ["densepose_gps_TTA", "AP", 60.22, 0.03], ["densepose_gpsm_TTA", "AP", 63.85, 0.03]]


@@ -0,0 +1,19 @@
_BASE_: "../Base-DensePose-RCNN-FPN.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
RESNETS:
DEPTH: 50
ROI_DENSEPOSE_HEAD:
UV_CONFIDENCE:
ENABLED: True
TYPE: "iid_iso"
POINT_REGRESSION_WEIGHTS: 0.0005
DATASETS:
TRAIN: ("densepose_coco_2014_minival_100",)
TEST: ("densepose_coco_2014_minival_100",)
SOLVER:
CLIP_GRADIENTS:
ENABLED: True
MAX_ITER: 40
STEPS: (30,)
WARMUP_FACTOR: 0.025


@@ -0,0 +1,19 @@
_BASE_: "../Base-DensePose-RCNN-FPN.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
RESNETS:
DEPTH: 50
ROI_DENSEPOSE_HEAD:
UV_CONFIDENCE:
ENABLED: True
TYPE: "indep_aniso"
POINT_REGRESSION_WEIGHTS: 0.0005
DATASETS:
TRAIN: ("densepose_coco_2014_minival_100",)
TEST: ("densepose_coco_2014_minival_100",)
SOLVER:
CLIP_GRADIENTS:
ENABLED: True
MAX_ITER: 40
STEPS: (30,)
WARMUP_FACTOR: 0.025


@@ -0,0 +1,8 @@
_BASE_: "../densepose_rcnn_R_50_FPN_s1x.yaml"
MODEL:
WEIGHTS: "https://dl.fbaipublicfiles.com/densepose/densepose_rcnn_R_50_FPN_s1x/165712039/model_final_162be9.pkl"
DATASETS:
TRAIN: ()
TEST: ("densepose_coco_2014_minival_100",)
TEST:
EXPECTED_RESULTS: [["bbox", "AP", 59.27, 0.025], ["densepose_gps", "AP", 60.11, 0.02], ["densepose_gpsm", "AP", 64.20, 0.02]]


@@ -0,0 +1,9 @@
_BASE_: "../Base-DensePose-RCNN-FPN.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
DATASETS:
TRAIN: ("densepose_coco_2014_minival_100",)
TEST: ("densepose_coco_2014_minival_100",)
SOLVER:
MAX_ITER: 40
STEPS: (30,)


@@ -0,0 +1,14 @@
_BASE_: "../Base-DensePose-RCNN-FPN.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
ROI_HEADS:
NUM_CLASSES: 1
DATASETS:
TRAIN: ("densepose_coco_2014_minival",)
TEST: ("densepose_coco_2014_minival",)
SOLVER:
MAX_ITER: 6000
STEPS: (5500, 5800)
TEST:
EXPECTED_RESULTS: [["bbox", "AP", 58.27, 1.0], ["densepose_gps", "AP", 42.47, 1.5], ["densepose_gpsm", "AP", 49.20, 1.5]]


@@ -0,0 +1,9 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
from .data.datasets import builtin # just to register data
from .config import add_densepose_config, add_dataset_category_config
from .densepose_head import ROI_DENSEPOSE_HEAD_REGISTRY
from .evaluator import DensePoseCOCOEvaluator
from .roi_head import DensePoseROIHeads
from .data.structures import DensePoseDataRelative, DensePoseList, DensePoseTransformData
from .modeling.test_time_augmentation import DensePoseGeneralizedRCNNWithTTA
from .utils.transform import load_from_cfg


@@ -0,0 +1,68 @@
# -*- coding: utf-8 -*-
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
from detectron2.config import CfgNode as CN
def add_dataset_category_config(cfg: CN):
"""
Add config for additional category-related dataset options
- category whitelisting
- category mapping
"""
_C = cfg
_C.DATASETS.CATEGORY_MAPS = CN(new_allowed=True)
_C.DATASETS.WHITELISTED_CATEGORIES = CN(new_allowed=True)
def add_densepose_config(cfg: CN):
"""
Add config for densepose head.
"""
_C = cfg
_C.MODEL.DENSEPOSE_ON = True
_C.MODEL.ROI_DENSEPOSE_HEAD = CN()
_C.MODEL.ROI_DENSEPOSE_HEAD.NAME = ""
_C.MODEL.ROI_DENSEPOSE_HEAD.NUM_STACKED_CONVS = 8
# Number of parts used for point labels
_C.MODEL.ROI_DENSEPOSE_HEAD.NUM_PATCHES = 24
_C.MODEL.ROI_DENSEPOSE_HEAD.DECONV_KERNEL = 4
_C.MODEL.ROI_DENSEPOSE_HEAD.CONV_HEAD_DIM = 512
_C.MODEL.ROI_DENSEPOSE_HEAD.CONV_HEAD_KERNEL = 3
_C.MODEL.ROI_DENSEPOSE_HEAD.UP_SCALE = 2
_C.MODEL.ROI_DENSEPOSE_HEAD.HEATMAP_SIZE = 112
_C.MODEL.ROI_DENSEPOSE_HEAD.POOLER_TYPE = "ROIAlignV2"
_C.MODEL.ROI_DENSEPOSE_HEAD.POOLER_RESOLUTION = 28
_C.MODEL.ROI_DENSEPOSE_HEAD.POOLER_SAMPLING_RATIO = 2
_C.MODEL.ROI_DENSEPOSE_HEAD.NUM_COARSE_SEGM_CHANNELS = 2 # 15 or 2
# Overlap threshold for an RoI to be considered foreground (if >= FG_IOU_THRESHOLD)
_C.MODEL.ROI_DENSEPOSE_HEAD.FG_IOU_THRESHOLD = 0.7
    # Loss weights for annotation masks (14 parts)
_C.MODEL.ROI_DENSEPOSE_HEAD.INDEX_WEIGHTS = 5.0
    # Loss weights for surface parts (24 parts)
_C.MODEL.ROI_DENSEPOSE_HEAD.PART_WEIGHTS = 1.0
# Loss weights for UV regression.
_C.MODEL.ROI_DENSEPOSE_HEAD.POINT_REGRESSION_WEIGHTS = 0.01
# For Decoder
_C.MODEL.ROI_DENSEPOSE_HEAD.DECODER_ON = True
_C.MODEL.ROI_DENSEPOSE_HEAD.DECODER_NUM_CLASSES = 256
_C.MODEL.ROI_DENSEPOSE_HEAD.DECODER_CONV_DIMS = 256
_C.MODEL.ROI_DENSEPOSE_HEAD.DECODER_NORM = ""
_C.MODEL.ROI_DENSEPOSE_HEAD.DECODER_COMMON_STRIDE = 4
# For DeepLab head
_C.MODEL.ROI_DENSEPOSE_HEAD.DEEPLAB = CN()
_C.MODEL.ROI_DENSEPOSE_HEAD.DEEPLAB.NORM = "GN"
_C.MODEL.ROI_DENSEPOSE_HEAD.DEEPLAB.NONLOCAL_ON = 0
# Confidences
# Enable learning confidences (variances) along with the actual values
_C.MODEL.ROI_DENSEPOSE_HEAD.UV_CONFIDENCE = CN({"ENABLED": False})
# UV confidence lower bound
_C.MODEL.ROI_DENSEPOSE_HEAD.UV_CONFIDENCE.EPSILON = 0.01
# Statistical model type for confidence learning, possible values:
# - "iid_iso": statistically independent identically distributed residuals
# with isotropic covariance
# - "indep_aniso": statistically independent residuals with anisotropic
# covariances
_C.MODEL.ROI_DENSEPOSE_HEAD.UV_CONFIDENCE.TYPE = "iid_iso"
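A small usage sketch of the two helpers above; the override values are purely illustrative:
```
# Sketch: extend a default Detectron2 config with the DensePose options defined above.
from detectron2.config import get_cfg
from densepose import add_densepose_config, add_dataset_category_config

cfg = get_cfg()
add_dataset_category_config(cfg)  # adds DATASETS.CATEGORY_MAPS / DATASETS.WHITELISTED_CATEGORIES
add_densepose_config(cfg)         # adds the MODEL.ROI_DENSEPOSE_HEAD.* defaults

# Illustrative overrides, using the same "KEY VALUE" list mechanism as apply_net.py --opts.
cfg.merge_from_list([
    "MODEL.ROI_DENSEPOSE_HEAD.UV_CONFIDENCE.ENABLED", "True",
    "MODEL.ROI_DENSEPOSE_HEAD.UV_CONFIDENCE.TYPE", "iid_iso",
])
print(cfg.MODEL.ROI_DENSEPOSE_HEAD.UV_CONFIDENCE)
```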


@@ -0,0 +1,9 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
from .build import build_detection_test_loader, build_detection_train_loader
from .dataset_mapper import DatasetMapper
# ensure the builtin data are registered
from . import datasets
__all__ = [k for k in globals().keys() if not k.startswith("_")]


@@ -0,0 +1,405 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import itertools
import logging
import numpy as np
import operator
from typing import Any, Callable, Collection, Dict, Iterable, List, Optional
import torch
from detectron2.config import CfgNode
from detectron2.data import samplers
from detectron2.data.build import (
load_proposals_into_dataset,
print_instances_class_histogram,
trivial_batch_collator,
worker_init_reset_seed,
)
from detectron2.data.catalog import DatasetCatalog, MetadataCatalog
from detectron2.data.common import AspectRatioGroupedDataset, DatasetFromList, MapDataset
from detectron2.utils.comm import get_world_size
from .dataset_mapper import DatasetMapper
from .datasets.coco import DENSEPOSE_KEYS_WITHOUT_MASK as DENSEPOSE_COCO_KEYS_WITHOUT_MASK
from .datasets.coco import DENSEPOSE_MASK_KEY as DENSEPOSE_COCO_MASK_KEY
__all__ = ["build_detection_train_loader", "build_detection_test_loader"]
Instance = Dict[str, Any]
InstancePredicate = Callable[[Instance], bool]
def _compute_num_images_per_worker(cfg: CfgNode):
num_workers = get_world_size()
images_per_batch = cfg.SOLVER.IMS_PER_BATCH
assert (
images_per_batch % num_workers == 0
), "SOLVER.IMS_PER_BATCH ({}) must be divisible by the number of workers ({}).".format(
images_per_batch, num_workers
)
assert (
images_per_batch >= num_workers
), "SOLVER.IMS_PER_BATCH ({}) must be larger than the number of workers ({}).".format(
images_per_batch, num_workers
)
images_per_worker = images_per_batch // num_workers
return images_per_worker
def _map_category_id_to_contiguous_id(dataset_name: str, dataset_dicts: Iterable[Instance]):
meta = MetadataCatalog.get(dataset_name)
for dataset_dict in dataset_dicts:
for ann in dataset_dict["annotations"]:
ann["category_id"] = meta.thing_dataset_id_to_contiguous_id[ann["category_id"]]
def _add_category_id_to_contiguous_id_maps_to_metadata(dataset_names: Iterable[str]):
# merge categories for all data
merged_categories = {}
for dataset_name in dataset_names:
meta = MetadataCatalog.get(dataset_name)
for cat_id, cat_name in meta.categories.items():
if cat_id not in merged_categories:
merged_categories[cat_id] = (cat_name, dataset_name)
continue
cat_name_other, dataset_name_other = merged_categories[cat_id]
if cat_name_other != cat_name:
raise ValueError(
f"Incompatible categories for category ID {cat_id}: "
f'dataset {dataset_name} value "{cat_name}", '
f'dataset {dataset_name_other} value "{cat_name_other}"'
)
merged_cat_id_to_cont_id = {}
for i, cat_id in enumerate(sorted(merged_categories.keys())):
merged_cat_id_to_cont_id[cat_id] = i
# add category maps to metadata
for dataset_name in dataset_names:
meta = MetadataCatalog.get(dataset_name)
categories = meta.get("categories")
meta.thing_classes = [categories[cat_id] for cat_id in sorted(categories.keys())]
meta.thing_dataset_id_to_contiguous_id = {
cat_id: merged_cat_id_to_cont_id[cat_id] for cat_id in sorted(categories.keys())
}
meta.thing_contiguous_id_to_dataset_id = {
merged_cat_id_to_cont_id[cat_id]: cat_id for cat_id in sorted(categories.keys())
}
def _maybe_create_general_keep_instance_predicate(cfg: CfgNode) -> Optional[InstancePredicate]:
def has_annotations(instance: Instance) -> bool:
return "annotations" in instance
    def has_only_crowd_annotations(instance: Instance) -> bool:
        for ann in instance["annotations"]:
            if ann.get("iscrowd", 0) == 0:
                return False
        return True
    def general_keep_instance_predicate(instance: Instance) -> bool:
        return has_annotations(instance) and not has_only_crowd_annotations(instance)
if not cfg.DATALOADER.FILTER_EMPTY_ANNOTATIONS:
return None
return general_keep_instance_predicate
def _maybe_create_keypoints_keep_instance_predicate(cfg: CfgNode) -> Optional[InstancePredicate]:
min_num_keypoints = cfg.MODEL.ROI_KEYPOINT_HEAD.MIN_KEYPOINTS_PER_IMAGE
def has_sufficient_num_keypoints(instance: Instance) -> bool:
num_kpts = sum(
(np.array(ann["keypoints"][2::3]) > 0).sum()
for ann in instance["annotations"]
if "keypoints" in ann
)
return num_kpts >= min_num_keypoints
if cfg.MODEL.KEYPOINT_ON and (min_num_keypoints > 0):
return has_sufficient_num_keypoints
return None
def _maybe_create_mask_keep_instance_predicate(cfg: CfgNode) -> Optional[InstancePredicate]:
if not cfg.MODEL.MASK_ON:
return None
def has_mask_annotations(instance: Instance) -> bool:
return any("segmentation" in ann for ann in instance["annotations"])
return has_mask_annotations
def _maybe_create_densepose_keep_instance_predicate(cfg: CfgNode) -> Optional[InstancePredicate]:
if not cfg.MODEL.DENSEPOSE_ON:
return None
def has_densepose_annotations(instance: Instance) -> bool:
for ann in instance["annotations"]:
if all(key in ann for key in DENSEPOSE_COCO_KEYS_WITHOUT_MASK) and (
(DENSEPOSE_COCO_MASK_KEY in ann) or ("segmentation" in ann)
):
return True
return False
return has_densepose_annotations
def _maybe_create_specific_keep_instance_predicate(cfg: CfgNode) -> Optional[InstancePredicate]:
specific_predicate_creators = [
_maybe_create_keypoints_keep_instance_predicate,
_maybe_create_mask_keep_instance_predicate,
_maybe_create_densepose_keep_instance_predicate,
]
predicates = [creator(cfg) for creator in specific_predicate_creators]
predicates = [p for p in predicates if p is not None]
if not predicates:
return None
def combined_predicate(instance: Instance) -> bool:
return any(p(instance) for p in predicates)
return combined_predicate
def _get_train_keep_instance_predicate(cfg: CfgNode):
general_keep_predicate = _maybe_create_general_keep_instance_predicate(cfg)
combined_specific_keep_predicate = _maybe_create_specific_keep_instance_predicate(cfg)
def combined_general_specific_keep_predicate(instance: Instance) -> bool:
return general_keep_predicate(instance) and combined_specific_keep_predicate(instance)
if (general_keep_predicate is None) and (combined_specific_keep_predicate is None):
return None
if general_keep_predicate is None:
return combined_specific_keep_predicate
if combined_specific_keep_predicate is None:
return general_keep_predicate
return combined_general_specific_keep_predicate
def _get_test_keep_instance_predicate(cfg: CfgNode):
general_keep_predicate = _maybe_create_general_keep_instance_predicate(cfg)
return general_keep_predicate
def _maybe_filter_and_map_categories(
dataset_name: str, dataset_dicts: List[Instance]
) -> List[Instance]:
meta = MetadataCatalog.get(dataset_name)
whitelisted_categories = meta.get("whitelisted_categories")
category_map = meta.get("category_map", {})
if whitelisted_categories is None and not category_map:
return dataset_dicts
filtered_dataset_dicts = []
for dataset_dict in dataset_dicts:
anns = []
for ann in dataset_dict["annotations"]:
cat_id = ann["category_id"]
if whitelisted_categories is not None and cat_id not in whitelisted_categories:
continue
ann["category_id"] = category_map.get(cat_id, cat_id)
anns.append(ann)
dataset_dict["annotations"] = anns
filtered_dataset_dicts.append(dataset_dict)
return filtered_dataset_dicts
def _add_category_whitelists_to_metadata(cfg: CfgNode):
for dataset_name, whitelisted_cat_ids in cfg.DATASETS.WHITELISTED_CATEGORIES.items():
meta = MetadataCatalog.get(dataset_name)
meta.whitelisted_categories = whitelisted_cat_ids
logger = logging.getLogger(__name__)
logger.info(
"Whitelisted categories for dataset {}: {}".format(
dataset_name, meta.whitelisted_categories
)
)
def _add_category_maps_to_metadata(cfg: CfgNode):
for dataset_name, category_map in cfg.DATASETS.CATEGORY_MAPS.items():
category_map = {
int(cat_id_src): int(cat_id_dst) for cat_id_src, cat_id_dst in category_map.items()
}
meta = MetadataCatalog.get(dataset_name)
meta.category_map = category_map
logger = logging.getLogger(__name__)
logger.info("Category maps for dataset {}: {}".format(dataset_name, meta.category_map))
def combine_detection_dataset_dicts(
dataset_names: Collection[str],
keep_instance_predicate: Optional[InstancePredicate] = None,
proposal_files: Optional[Collection[str]] = None,
) -> List[Instance]:
"""
Load and prepare dataset dicts for training / testing
Args:
dataset_names (Collection[str]): a list of dataset names
keep_instance_predicate (Callable: Dict[str, Any] -> bool): predicate
applied to instance dicts which defines whether to keep the instance
proposal_files (Collection[str]): if given, a list of object proposal files
that match each dataset in `dataset_names`.
"""
assert len(dataset_names)
if proposal_files is None:
proposal_files = [None] * len(dataset_names)
assert len(dataset_names) == len(proposal_files)
# load annotations and dataset metadata
dataset_map = {}
for dataset_name in dataset_names:
dataset_dicts = DatasetCatalog.get(dataset_name)
dataset_map[dataset_name] = dataset_dicts
# initialize category maps
_add_category_id_to_contiguous_id_maps_to_metadata(dataset_names)
# apply category maps
all_datasets_dicts = []
for dataset_name, proposal_file in zip(dataset_names, proposal_files):
dataset_dicts = dataset_map[dataset_name]
assert len(dataset_dicts), f"Dataset '{dataset_name}' is empty!"
if proposal_file is not None:
dataset_dicts = load_proposals_into_dataset(dataset_dicts, proposal_file)
dataset_dicts = _maybe_filter_and_map_categories(dataset_name, dataset_dicts)
_map_category_id_to_contiguous_id(dataset_name, dataset_dicts)
print_instances_class_histogram(
dataset_dicts, MetadataCatalog.get(dataset_name).thing_classes
)
all_datasets_dicts.append(dataset_dicts)
if keep_instance_predicate is not None:
all_datasets_dicts_plain = [
d
for d in itertools.chain.from_iterable(all_datasets_dicts)
if keep_instance_predicate(d)
]
else:
all_datasets_dicts_plain = list(itertools.chain.from_iterable(all_datasets_dicts))
return all_datasets_dicts_plain
def build_detection_train_loader(cfg: CfgNode, mapper=None):
"""
A data loader is created in a way similar to that of Detectron2.
The main differences are:
     - it allows combining datasets with different but compatible object category sets
The data loader is created by the following steps:
1. Use the dataset names in config to query :class:`DatasetCatalog`, and obtain a list of dicts.
2. Start workers to work on the dicts. Each worker will:
* Map each metadata dict into another format to be consumed by the model.
* Batch them by simply putting dicts into a list.
The batched ``list[mapped_dict]`` is what this dataloader will return.
Args:
cfg (CfgNode): the config
mapper (callable): a callable which takes a sample (dict) from dataset and
returns the format to be consumed by the model.
By default it will be `DatasetMapper(cfg, True)`.
Returns:
an infinite iterator of training data
"""
images_per_worker = _compute_num_images_per_worker(cfg)
_add_category_whitelists_to_metadata(cfg)
_add_category_maps_to_metadata(cfg)
dataset_dicts = combine_detection_dataset_dicts(
cfg.DATASETS.TRAIN,
keep_instance_predicate=_get_train_keep_instance_predicate(cfg),
proposal_files=cfg.DATASETS.PROPOSAL_FILES_TRAIN if cfg.MODEL.LOAD_PROPOSALS else None,
)
dataset = DatasetFromList(dataset_dicts, copy=False)
if mapper is None:
mapper = DatasetMapper(cfg, True)
dataset = MapDataset(dataset, mapper)
sampler_name = cfg.DATALOADER.SAMPLER_TRAIN
logger = logging.getLogger(__name__)
logger.info("Using training sampler {}".format(sampler_name))
if sampler_name == "TrainingSampler":
sampler = samplers.TrainingSampler(len(dataset))
elif sampler_name == "RepeatFactorTrainingSampler":
sampler = samplers.RepeatFactorTrainingSampler(
dataset_dicts, cfg.DATALOADER.REPEAT_THRESHOLD
)
else:
raise ValueError("Unknown training sampler: {}".format(sampler_name))
if cfg.DATALOADER.ASPECT_RATIO_GROUPING:
data_loader = torch.utils.data.DataLoader(
dataset,
sampler=sampler,
num_workers=cfg.DATALOADER.NUM_WORKERS,
batch_sampler=None,
collate_fn=operator.itemgetter(0), # don't batch, but yield individual elements
worker_init_fn=worker_init_reset_seed,
) # yield individual mapped dict
data_loader = AspectRatioGroupedDataset(data_loader, images_per_worker)
else:
batch_sampler = torch.utils.data.sampler.BatchSampler(
sampler, images_per_worker, drop_last=True
)
# drop_last so the batch always have the same size
data_loader = torch.utils.data.DataLoader(
dataset,
num_workers=cfg.DATALOADER.NUM_WORKERS,
batch_sampler=batch_sampler,
collate_fn=trivial_batch_collator,
worker_init_fn=worker_init_reset_seed,
)
return data_loader
def build_detection_test_loader(cfg, dataset_name, mapper=None):
"""
Similar to `build_detection_train_loader`.
But this function uses the given `dataset_name` argument (instead of the names in cfg),
and uses batch size 1.
Args:
cfg: a detectron2 CfgNode
dataset_name (str): a name of the dataset that's available in the DatasetCatalog
mapper (callable): a callable which takes a sample (dict) from dataset
and returns the format to be consumed by the model.
By default it will be `DatasetMapper(cfg, False)`.
Returns:
DataLoader: a torch DataLoader, that loads the given detection
dataset, with test-time transformation and batching.
"""
_add_category_whitelists_to_metadata(cfg)
_add_category_maps_to_metadata(cfg)
dataset_dicts = combine_detection_dataset_dicts(
[dataset_name],
keep_instance_predicate=_get_test_keep_instance_predicate(cfg),
proposal_files=[
cfg.DATASETS.PROPOSAL_FILES_TEST[list(cfg.DATASETS.TEST).index(dataset_name)]
]
if cfg.MODEL.LOAD_PROPOSALS
else None,
)
dataset = DatasetFromList(dataset_dicts)
if mapper is None:
mapper = DatasetMapper(cfg, False)
dataset = MapDataset(dataset, mapper)
sampler = samplers.InferenceSampler(len(dataset))
# Always use 1 image per worker during inference since this is the
# standard when reporting inference time in papers.
batch_sampler = torch.utils.data.sampler.BatchSampler(sampler, 1, drop_last=False)
data_loader = torch.utils.data.DataLoader(
dataset,
num_workers=cfg.DATALOADER.NUM_WORKERS,
batch_sampler=batch_sampler,
collate_fn=trivial_batch_collator,
)
return data_loader
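A usage sketch of the loaders defined above, assuming the DensePose datasets referenced by the config have been registered (importing `densepose` triggers the builtin registration) and are available locally; the config path is a placeholder:
```
# Sketch: build the customized DensePose data loaders defined above.
from detectron2.config import get_cfg
from densepose import add_densepose_config  # importing densepose also registers builtin datasets
from densepose.data import build_detection_train_loader, build_detection_test_loader

cfg = get_cfg()
add_densepose_config(cfg)
cfg.merge_from_file("configs/densepose_rcnn_R_50_FPN_s1x.yaml")  # placeholder path

train_loader = build_detection_train_loader(cfg)  # infinite iterator over batched dicts
test_loader = build_detection_test_loader(cfg, "densepose_coco_2014_minival")

batch = next(iter(train_loader))  # a list of mapped dicts (one per image in the batch)
print(len(batch), batch[0]["image"].shape)
```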


@@ -0,0 +1,118 @@
# -*- coding: utf-8 -*-
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import copy
import torch
from fvcore.common.file_io import PathManager
from detectron2.data import MetadataCatalog
from detectron2.data import detection_utils as utils
from detectron2.data import transforms as T
from .structures import DensePoseDataRelative, DensePoseList, DensePoseTransformData
class DatasetMapper:
"""
A customized version of `detectron2.data.DatasetMapper`
"""
def __init__(self, cfg, is_train=True):
self.tfm_gens = utils.build_transform_gen(cfg, is_train)
# fmt: off
self.img_format = cfg.INPUT.FORMAT
self.mask_on = cfg.MODEL.MASK_ON
self.keypoint_on = cfg.MODEL.KEYPOINT_ON
self.densepose_on = cfg.MODEL.DENSEPOSE_ON
assert not cfg.MODEL.LOAD_PROPOSALS, "not supported yet"
# fmt: on
if self.keypoint_on and is_train:
# Flip only makes sense in training
self.keypoint_hflip_indices = utils.create_keypoint_hflip_indices(cfg.DATASETS.TRAIN)
else:
self.keypoint_hflip_indices = None
if self.densepose_on:
densepose_transform_srcs = [
MetadataCatalog.get(ds).densepose_transform_src
for ds in cfg.DATASETS.TRAIN + cfg.DATASETS.TEST
]
assert len(densepose_transform_srcs) > 0
# TODO: check that DensePose transformation data is the same for
# all the data. Otherwise one would have to pass DB ID with
# each entry to select proper transformation data. For now, since
# all DensePose annotated data uses the same data semantics, we
# omit this check.
densepose_transform_data_fpath = PathManager.get_local_path(densepose_transform_srcs[0])
self.densepose_transform_data = DensePoseTransformData.load(
densepose_transform_data_fpath
)
self.is_train = is_train
def __call__(self, dataset_dict):
"""
Args:
dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.
Returns:
dict: a format that builtin models in detectron2 accept
"""
dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below
image = utils.read_image(dataset_dict["file_name"], format=self.img_format)
utils.check_image_size(dataset_dict, image)
image, transforms = T.apply_transform_gens(self.tfm_gens, image)
image_shape = image.shape[:2] # h, w
dataset_dict["image"] = torch.as_tensor(image.transpose(2, 0, 1).astype("float32"))
if not self.is_train:
dataset_dict.pop("annotations", None)
return dataset_dict
for anno in dataset_dict["annotations"]:
if not self.mask_on:
anno.pop("segmentation", None)
if not self.keypoint_on:
anno.pop("keypoints", None)
# USER: Implement additional transformations if you have other types of data
        # USER: Don't call transpose_densepose if you don't need it
annos = [
self._transform_densepose(
utils.transform_instance_annotations(
obj, transforms, image_shape, keypoint_hflip_indices=self.keypoint_hflip_indices
),
transforms,
)
for obj in dataset_dict.pop("annotations")
if obj.get("iscrowd", 0) == 0
]
instances = utils.annotations_to_instances(annos, image_shape)
if len(annos) and "densepose" in annos[0]:
gt_densepose = [obj["densepose"] for obj in annos]
instances.gt_densepose = DensePoseList(gt_densepose, instances.gt_boxes, image_shape)
dataset_dict["instances"] = instances[instances.gt_boxes.nonempty()]
return dataset_dict
def _transform_densepose(self, annotation, transforms):
if not self.densepose_on:
return annotation
# Handle densepose annotations
is_valid, reason_not_valid = DensePoseDataRelative.validate_annotation(annotation)
if is_valid:
densepose_data = DensePoseDataRelative(annotation, cleanup=True)
densepose_data.apply_transform(transforms, self.densepose_transform_data)
annotation["densepose"] = densepose_data
else:
# logger = logging.getLogger(__name__)
# logger.debug("Could not load DensePose annotation: {}".format(reason_not_valid))
DensePoseDataRelative.cleanup_annotation(annotation)
# NOTE: annotations for certain instances may be unavailable.
            # 'None' is accepted by the DensePoseList data structure.
annotation["densepose"] = None
return annotation
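A sketch of running this mapper on a single registered record; it assumes the DensePose COCO data and the UV symmetry file referenced by the dataset metadata are available locally, and the config path is a placeholder:
```
# Sketch: map one raw dataset record into the format consumed by the model.
from detectron2.config import get_cfg
from detectron2.data import DatasetCatalog
from densepose import add_densepose_config
from densepose.data import DatasetMapper

cfg = get_cfg()
add_densepose_config(cfg)
cfg.merge_from_file("configs/densepose_rcnn_R_50_FPN_s1x.yaml")  # placeholder path

records = DatasetCatalog.get("densepose_coco_2014_minival")
mapper = DatasetMapper(cfg, is_train=True)
sample = mapper(records[0])
print(sample["image"].shape)  # CHW float32 image tensor
print(sample["instances"])    # Instances with gt_boxes / gt_classes / gt_densepose
```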


@@ -0,0 +1,5 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
from . import builtin # ensure the builtin data are registered
__all__ = [k for k in globals().keys() if "builtin" not in k and not k.startswith("_")]


@@ -0,0 +1,10 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
from .coco import BASE_DATASETS as BASE_COCO_DATASETS
from .coco import DATASETS as COCO_DATASETS
from .coco import register_datasets as register_coco_datasets
DEFAULT_DATASETS_ROOT = "data"
register_coco_datasets(COCO_DATASETS, DEFAULT_DATASETS_ROOT)
register_coco_datasets(BASE_COCO_DATASETS, DEFAULT_DATASETS_ROOT)


@@ -0,0 +1,314 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import contextlib
import io
import logging
import os
from dataclasses import dataclass
from typing import Any, Dict, Iterable, List, Optional
from fvcore.common.file_io import PathManager
from fvcore.common.timer import Timer
from detectron2.data import DatasetCatalog, MetadataCatalog
from detectron2.structures import BoxMode
DENSEPOSE_MASK_KEY = "dp_masks"
DENSEPOSE_KEYS_WITHOUT_MASK = ["dp_x", "dp_y", "dp_I", "dp_U", "dp_V"]
DENSEPOSE_KEYS = DENSEPOSE_KEYS_WITHOUT_MASK + [DENSEPOSE_MASK_KEY]
DENSEPOSE_METADATA_URL_PREFIX = "https://dl.fbaipublicfiles.com/densepose/data/"
@dataclass
class CocoDatasetInfo:
name: str
images_root: str
annotations_fpath: str
DATASETS = [
CocoDatasetInfo(
name="densepose_coco_2014_train",
images_root="coco/train2014",
annotations_fpath="coco/annotations/densepose_train2014.json",
),
CocoDatasetInfo(
name="densepose_coco_2014_minival",
images_root="coco/val2014",
annotations_fpath="coco/annotations/densepose_minival2014.json",
),
CocoDatasetInfo(
name="densepose_coco_2014_minival_100",
images_root="coco/val2014",
annotations_fpath="coco/annotations/densepose_minival2014_100.json",
),
CocoDatasetInfo(
name="densepose_coco_2014_valminusminival",
images_root="coco/val2014",
annotations_fpath="coco/annotations/densepose_valminusminival2014.json",
),
CocoDatasetInfo(
name="densepose_chimps",
images_root="densepose_evolution/densepose_chimps",
annotations_fpath="densepose_evolution/annotations/densepose_chimps_densepose.json",
),
]
BASE_DATASETS = [
CocoDatasetInfo(
name="base_coco_2017_train",
images_root="coco/train2017",
annotations_fpath="coco/annotations/instances_train2017.json",
),
CocoDatasetInfo(
name="base_coco_2017_val",
images_root="coco/val2017",
annotations_fpath="coco/annotations/instances_val2017.json",
),
CocoDatasetInfo(
name="base_coco_2017_val_100",
images_root="coco/val2017",
annotations_fpath="coco/annotations/instances_val2017_100.json",
),
]
def _is_relative_local_path(path: os.PathLike):
path_str = os.fsdecode(path)
return ("://" not in path_str) and not os.path.isabs(path)
def _maybe_prepend_base_path(base_path: Optional[os.PathLike], path: os.PathLike):
"""
Prepends the provided path with a base path prefix if:
1) base path is not None;
2) path is a local path
"""
if base_path is None:
return path
if _is_relative_local_path(path):
return os.path.join(base_path, path)
return path
def get_metadata(base_path: Optional[os.PathLike]) -> Dict[str, Any]:
"""
Returns metadata associated with COCO DensePose data
Args:
base_path: Optional[os.PathLike]
Base path used to load metadata from
Returns:
Dict[str, Any]
Metadata in the form of a dictionary
"""
meta = {
"densepose_transform_src": _maybe_prepend_base_path(
base_path, "UV_symmetry_transforms.mat"
),
"densepose_smpl_subdiv": _maybe_prepend_base_path(base_path, "SMPL_subdiv.mat"),
"densepose_smpl_subdiv_transform": _maybe_prepend_base_path(
base_path, "SMPL_SUBDIV_TRANSFORM.mat"
),
}
return meta
def _load_coco_annotations(json_file: str):
"""
Load COCO annotations from a JSON file
Args:
json_file: str
Path to the file to load annotations from
Returns:
Instance of `pycocotools.coco.COCO` that provides access to annotations
data
"""
from pycocotools.coco import COCO
logger = logging.getLogger(__name__)
timer = Timer()
with contextlib.redirect_stdout(io.StringIO()):
coco_api = COCO(json_file)
if timer.seconds() > 1:
logger.info("Loading {} takes {:.2f} seconds.".format(json_file, timer.seconds()))
return coco_api
def _add_categories_metadata(dataset_name: str, categories: Dict[str, Any]):
meta = MetadataCatalog.get(dataset_name)
meta.categories = {c["id"]: c["name"] for c in categories}
logger = logging.getLogger(__name__)
logger.info("Dataset {} categories: {}".format(dataset_name, categories))
def _verify_annotations_have_unique_ids(json_file: str, anns: List[List[Dict[str, Any]]]):
if "minival" in json_file:
# Skip validation on COCO2014 valminusminival and minival annotations
# The ratio of buggy annotations there is tiny and does not affect accuracy
# Therefore we explicitly white-list them
return
ann_ids = [ann["id"] for anns_per_image in anns for ann in anns_per_image]
assert len(set(ann_ids)) == len(ann_ids), "Annotation ids in '{}' are not unique!".format(
json_file
)
def _maybe_add_bbox(obj: Dict[str, Any], ann_dict: Dict[str, Any]):
if "bbox" not in ann_dict:
return
obj["bbox"] = ann_dict["bbox"]
obj["bbox_mode"] = BoxMode.XYWH_ABS
def _maybe_add_segm(obj: Dict[str, Any], ann_dict: Dict[str, Any]):
if "segmentation" not in ann_dict:
return
segm = ann_dict["segmentation"]
if not isinstance(segm, dict):
# filter out invalid polygons (< 3 points)
segm = [poly for poly in segm if len(poly) % 2 == 0 and len(poly) >= 6]
if len(segm) == 0:
return
obj["segmentation"] = segm
def _maybe_add_keypoints(obj: Dict[str, Any], ann_dict: Dict[str, Any]):
if "keypoints" not in ann_dict:
return
keypts = ann_dict["keypoints"] # list[int]
for idx, v in enumerate(keypts):
if idx % 3 != 2:
# COCO's segmentation coordinates are floating points in [0, H or W],
# but keypoint coordinates are integers in [0, H-1 or W-1]
# Therefore we assume the coordinates are "pixel indices" and
# add 0.5 to convert to floating point coordinates.
keypts[idx] = v + 0.5
obj["keypoints"] = keypts
def _maybe_add_densepose(obj: Dict[str, Any], ann_dict: Dict[str, Any]):
for key in DENSEPOSE_KEYS:
if key in ann_dict:
obj[key] = ann_dict[key]
def _combine_images_with_annotations(
dataset_name: str,
image_root: str,
img_datas: Iterable[Dict[str, Any]],
ann_datas: Iterable[Iterable[Dict[str, Any]]],
):
ann_keys = ["iscrowd", "category_id"]
dataset_dicts = []
for img_dict, ann_dicts in zip(img_datas, ann_datas):
record = {}
record["file_name"] = os.path.join(image_root, img_dict["file_name"])
record["height"] = img_dict["height"]
record["width"] = img_dict["width"]
record["image_id"] = img_dict["id"]
record["dataset"] = dataset_name
objs = []
for ann_dict in ann_dicts:
assert ann_dict["image_id"] == record["image_id"]
assert ann_dict.get("ignore", 0) == 0
obj = {key: ann_dict[key] for key in ann_keys if key in ann_dict}
_maybe_add_bbox(obj, ann_dict)
_maybe_add_segm(obj, ann_dict)
_maybe_add_keypoints(obj, ann_dict)
_maybe_add_densepose(obj, ann_dict)
objs.append(obj)
record["annotations"] = objs
dataset_dicts.append(record)
return dataset_dicts
def load_coco_json(annotations_json_file: str, image_root: str, dataset_name: str):
"""
Loads a JSON file with annotations in COCO instances format.
    Replaces `detectron2.data.datasets.coco.load_coco_json` to handle metadata
    in a more flexible way. Postpones category mapping to a later stage to be
    able to combine several datasets with different (but coherent) sets of
    categories.
Args:
annotations_json_file: str
Path to the JSON file with annotations in COCO instances format.
image_root: str
directory that contains all the images
dataset_name: str
the name that identifies a dataset, e.g. "densepose_coco_2014_train"
"""
coco_api = _load_coco_annotations(PathManager.get_local_path(annotations_json_file))
_add_categories_metadata(dataset_name, coco_api.loadCats(coco_api.getCatIds()))
# sort indices for reproducible results
img_ids = sorted(coco_api.imgs.keys())
# imgs is a list of dicts, each looks something like:
# {'license': 4,
# 'url': 'http://farm6.staticflickr.com/5454/9413846304_881d5e5c3b_z.jpg',
# 'file_name': 'COCO_val2014_000000001268.jpg',
# 'height': 427,
# 'width': 640,
# 'date_captured': '2013-11-17 05:57:24',
# 'id': 1268}
imgs = coco_api.loadImgs(img_ids)
logger = logging.getLogger(__name__)
logger.info("Loaded {} images in COCO format from {}".format(len(imgs), annotations_json_file))
# anns is a list[list[dict]], where each dict is an annotation
# record for an object. The inner list enumerates the objects in an image
# and the outer list enumerates over images.
anns = [coco_api.imgToAnns[img_id] for img_id in img_ids]
_verify_annotations_have_unique_ids(annotations_json_file, anns)
dataset_records = _combine_images_with_annotations(dataset_name, image_root, imgs, anns)
return dataset_records
def register_dataset(dataset_data: CocoDatasetInfo, datasets_root: Optional[os.PathLike] = None):
"""
Registers provided COCO DensePose dataset
Args:
dataset_data: CocoDatasetInfo
Dataset data
datasets_root: Optional[os.PathLike]
Datasets root folder (default: None)
"""
annotations_fpath = _maybe_prepend_base_path(datasets_root, dataset_data.annotations_fpath)
images_root = _maybe_prepend_base_path(datasets_root, dataset_data.images_root)
def load_annotations():
return load_coco_json(
annotations_json_file=annotations_fpath,
image_root=images_root,
dataset_name=dataset_data.name,
)
DatasetCatalog.register(dataset_data.name, load_annotations)
MetadataCatalog.get(dataset_data.name).set(
json_file=annotations_fpath,
image_root=images_root,
**get_metadata(DENSEPOSE_METADATA_URL_PREFIX)
)
def register_datasets(
datasets_data: Iterable[CocoDatasetInfo], datasets_root: Optional[os.PathLike] = None
):
"""
    Registers the provided COCO DensePose datasets
Args:
datasets_data: Iterable[CocoDatasetInfo]
            An iterable of CocoDatasetInfo objects describing the datasets to register
datasets_root: Optional[os.PathLike]
Datasets root folder (default: None)
"""
for dataset_data in datasets_data:
register_dataset(dataset_data, datasets_root)
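A sketch of registering an additional dataset in the same COCO DensePose format using the helpers above; the name and paths below are hypothetical:
```
# Sketch: register a hypothetical extra dataset using the helpers above.
from detectron2.data import DatasetCatalog, MetadataCatalog
from densepose.data.datasets.coco import CocoDatasetInfo, register_dataset

my_dataset = CocoDatasetInfo(
    name="densepose_my_dataset_train",                # hypothetical dataset name
    images_root="my_dataset/images",                  # hypothetical image folder
    annotations_fpath="my_dataset/annotations.json",  # hypothetical COCO-style annotations
)
register_dataset(my_dataset, datasets_root="data")

# The dataset becomes visible through the standard Detectron2 catalogs.
print(MetadataCatalog.get("densepose_my_dataset_train").json_file)
records = DatasetCatalog.get("densepose_my_dataset_train")  # loads the JSON lazily
```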


@@ -0,0 +1,579 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import base64
import numpy as np
from io import BytesIO
import torch
from PIL import Image
from torch.nn import functional as F
class DensePoseTransformData(object):
# Horizontal symmetry label transforms used for horizontal flip
MASK_LABEL_SYMMETRIES = [0, 1, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 14]
# fmt: off
POINT_LABEL_SYMMETRIES = [ 0, 1, 2, 4, 3, 6, 5, 8, 7, 10, 9, 12, 11, 14, 13, 16, 15, 18, 17, 20, 19, 22, 21, 24, 23] # noqa
# fmt: on
def __init__(self, uv_symmetries):
self.mask_label_symmetries = DensePoseTransformData.MASK_LABEL_SYMMETRIES
self.point_label_symmetries = DensePoseTransformData.POINT_LABEL_SYMMETRIES
self.uv_symmetries = uv_symmetries
@staticmethod
def load(fpath):
import scipy.io
uv_symmetry_map = scipy.io.loadmat(fpath)
uv_symmetry_map_torch = {}
for key in ["U_transforms", "V_transforms"]:
uv_symmetry_map_torch[key] = []
map_src = uv_symmetry_map[key]
map_dst = uv_symmetry_map_torch[key]
for i in range(map_src.shape[1]):
map_dst.append(torch.from_numpy(map_src[0, i]).to(dtype=torch.float))
uv_symmetry_map_torch[key] = torch.stack(map_dst, dim=0).to(
device=torch.cuda.current_device()
)
transform_data = DensePoseTransformData(uv_symmetry_map_torch)
return transform_data
class DensePoseDataRelative(object):
"""
Dense pose relative annotations that can be applied to any bounding box:
x - normalized X coordinates [0, 255] of annotated points
y - normalized Y coordinates [0, 255] of annotated points
i - body part labels 0,...,24 for annotated points
u - body part U coordinates [0, 1] for annotated points
v - body part V coordinates [0, 1] for annotated points
segm - 256x256 segmentation mask with values 0,...,14
To obtain absolute x and y data wrt some bounding box one needs to first
divide the data by 256, multiply by the respective bounding box size
and add bounding box offset:
x_img = x0 + x_norm * w / 256.0
y_img = y0 + y_norm * h / 256.0
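    Example (illustrative values): for a bounding box with x0 = 100.0 and w = 50.0,
    an annotated point with x_norm = 128.0 maps to
    x_img = 100.0 + 128.0 * 50.0 / 256.0 = 125.0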
Segmentation masks are typically sampled to get image-based masks.
"""
# Key for normalized X coordinates in annotation dict
X_KEY = "dp_x"
# Key for normalized Y coordinates in annotation dict
Y_KEY = "dp_y"
# Key for U part coordinates in annotation dict
U_KEY = "dp_U"
# Key for V part coordinates in annotation dict
V_KEY = "dp_V"
# Key for I point labels in annotation dict
I_KEY = "dp_I"
# Key for segmentation mask in annotation dict
S_KEY = "dp_masks"
# Number of body parts in segmentation masks
N_BODY_PARTS = 14
# Number of parts in point labels
N_PART_LABELS = 24
MASK_SIZE = 256
def __init__(self, annotation, cleanup=False):
is_valid, reason_not_valid = DensePoseDataRelative.validate_annotation(annotation)
assert is_valid, "Invalid DensePose annotations: {}".format(reason_not_valid)
self.x = torch.as_tensor(annotation[DensePoseDataRelative.X_KEY])
self.y = torch.as_tensor(annotation[DensePoseDataRelative.Y_KEY])
self.i = torch.as_tensor(annotation[DensePoseDataRelative.I_KEY])
self.u = torch.as_tensor(annotation[DensePoseDataRelative.U_KEY])
self.v = torch.as_tensor(annotation[DensePoseDataRelative.V_KEY])
self.segm = DensePoseDataRelative.extract_segmentation_mask(annotation)
self.device = torch.device("cpu")
if cleanup:
DensePoseDataRelative.cleanup_annotation(annotation)
def to(self, device):
if self.device == device:
return self
new_data = DensePoseDataRelative.__new__(DensePoseDataRelative)
        new_data.x = self.x.to(device)
new_data.y = self.y.to(device)
new_data.i = self.i.to(device)
new_data.u = self.u.to(device)
new_data.v = self.v.to(device)
new_data.segm = self.segm.to(device)
new_data.device = device
return new_data
@staticmethod
def extract_segmentation_mask(annotation):
import pycocotools.mask as mask_utils
poly_specs = annotation[DensePoseDataRelative.S_KEY]
segm = torch.zeros((DensePoseDataRelative.MASK_SIZE,) * 2, dtype=torch.float32)
for i in range(DensePoseDataRelative.N_BODY_PARTS):
poly_i = poly_specs[i]
if poly_i:
mask_i = mask_utils.decode(poly_i)
segm[mask_i > 0] = i + 1
return segm
@staticmethod
def validate_annotation(annotation):
for key in [
DensePoseDataRelative.X_KEY,
DensePoseDataRelative.Y_KEY,
DensePoseDataRelative.I_KEY,
DensePoseDataRelative.U_KEY,
DensePoseDataRelative.V_KEY,
DensePoseDataRelative.S_KEY,
]:
if key not in annotation:
return False, "no {key} data in the annotation".format(key=key)
return True, None
@staticmethod
def cleanup_annotation(annotation):
for key in [
DensePoseDataRelative.X_KEY,
DensePoseDataRelative.Y_KEY,
DensePoseDataRelative.I_KEY,
DensePoseDataRelative.U_KEY,
DensePoseDataRelative.V_KEY,
DensePoseDataRelative.S_KEY,
]:
if key in annotation:
del annotation[key]
def apply_transform(self, transforms, densepose_transform_data):
self._transform_pts(transforms, densepose_transform_data)
self._transform_segm(transforms, densepose_transform_data)
def _transform_pts(self, transforms, dp_transform_data):
import detectron2.data.transforms as T
        # NOTE: This assumes that HFlipTransform is the only one that does flip
do_hflip = sum(isinstance(t, T.HFlipTransform) for t in transforms.transforms) % 2 == 1
if do_hflip:
self.x = self.segm.size(1) - self.x
self._flip_iuv_semantics(dp_transform_data)
def _flip_iuv_semantics(self, dp_transform_data: DensePoseTransformData) -> None:
i_old = self.i.clone()
uv_symmetries = dp_transform_data.uv_symmetries
pt_label_symmetries = dp_transform_data.point_label_symmetries
for i in range(self.N_PART_LABELS):
if i + 1 in i_old:
annot_indices_i = i_old == i + 1
if pt_label_symmetries[i + 1] != i + 1:
self.i[annot_indices_i] = pt_label_symmetries[i + 1]
u_loc = (self.u[annot_indices_i] * 255).long()
v_loc = (self.v[annot_indices_i] * 255).long()
self.u[annot_indices_i] = uv_symmetries["U_transforms"][i][v_loc, u_loc].to(
device=self.u.device
)
self.v[annot_indices_i] = uv_symmetries["V_transforms"][i][v_loc, u_loc].to(
device=self.v.device
)
def _transform_segm(self, transforms, dp_transform_data):
import detectron2.data.transforms as T
        # NOTE: This assumes that HFlipTransform is the only one that does flip
do_hflip = sum(isinstance(t, T.HFlipTransform) for t in transforms.transforms) % 2 == 1
if do_hflip:
self.segm = torch.flip(self.segm, [1])
self._flip_segm_semantics(dp_transform_data)
def _flip_segm_semantics(self, dp_transform_data):
old_segm = self.segm.clone()
mask_label_symmetries = dp_transform_data.mask_label_symmetries
for i in range(self.N_BODY_PARTS):
if mask_label_symmetries[i + 1] != i + 1:
self.segm[old_segm == i + 1] = mask_label_symmetries[i + 1]
def normalized_coords_transform(x0, y0, w, h):
"""
Coordinates transform that maps top left corner to (-1, -1) and bottom
right corner to (1, 1). Used for torch.grid_sample to initialize the
grid
"""
def f(p):
return (2 * (p[0] - x0) / w - 1, 2 * (p[1] - y0) / h - 1)
return f
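A quick corner check (an illustrative sketch, not part of the original file) makes the mapping used for `torch.grid_sample` concrete:

```
# Map a 100x50 box with top-left corner (10, 20) onto the [-1, 1] x [-1, 1] grid-sample range
f = normalized_coords_transform(x0=10, y0=20, w=100, h=50)
print(f((10, 20)))   # top-left corner  -> (-1.0, -1.0)
print(f((110, 70)))  # bottom-right     -> (1.0, 1.0)
```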
class DensePoseOutput(object):
def __init__(self, S, I, U, V, confidences):
"""
Args:
S (`torch.Tensor`): coarse segmentation tensor of size (N, A, H, W)
I (`torch.Tensor`): fine segmentation tensor of size (N, C, H, W)
U (`torch.Tensor`): U coordinates for each fine segmentation label of size (N, C, H, W)
V (`torch.Tensor`): V coordinates for each fine segmentation label of size (N, C, H, W)
confidences (dict of str -> `torch.Tensor`) estimated confidence model parameters
"""
self.S = S
self.I = I # noqa: E741
self.U = U
self.V = V
self.confidences = confidences
self._check_output_dims(S, I, U, V)
def _check_output_dims(self, S, I, U, V):
        assert (
            len(S.size()) == 4
        ), "Segmentation output should have 4 dimensions (NCHW), but has size {}".format(S.size())
        assert (
            len(I.size()) == 4
        ), "Part index output should have 4 dimensions (NCHW), but has size {}".format(I.size())
        assert (
            len(U.size()) == 4
        ), "U coordinates output should have 4 dimensions (NCHW), but has size {}".format(U.size())
        assert (
            len(V.size()) == 4
        ), "V coordinates output should have 4 dimensions (NCHW), but has size {}".format(V.size())
assert len(S) == len(I), (
"Number of output segmentation planes {} "
"should be equal to the number of output part index "
"planes {}".format(len(S), len(I))
)
assert S.size()[2:] == I.size()[2:], (
"Output segmentation plane size {} "
"should be equal to the output part index "
"plane size {}".format(S.size()[2:], I.size()[2:])
)
assert I.size() == U.size(), (
"Part index output shape {} "
"should be the same as U coordinates output shape {}".format(I.size(), U.size())
)
assert I.size() == V.size(), (
"Part index output shape {} "
"should be the same as V coordinates output shape {}".format(I.size(), V.size())
)
def resize(self, image_size_hw):
# do nothing - outputs are invariant to resize
pass
def _crop(self, S, I, U, V, bbox_old_xywh, bbox_new_xywh):
"""
Resample S, I, U, V from bbox_old to the cropped bbox_new
"""
x0old, y0old, wold, hold = bbox_old_xywh
x0new, y0new, wnew, hnew = bbox_new_xywh
tr_coords = normalized_coords_transform(x0old, y0old, wold, hold)
topleft = (x0new, y0new)
bottomright = (x0new + wnew, y0new + hnew)
topleft_norm = tr_coords(topleft)
bottomright_norm = tr_coords(bottomright)
hsize = S.size(1)
wsize = S.size(2)
grid = torch.meshgrid(
torch.arange(
topleft_norm[1],
bottomright_norm[1],
(bottomright_norm[1] - topleft_norm[1]) / hsize,
)[:hsize],
torch.arange(
topleft_norm[0],
bottomright_norm[0],
(bottomright_norm[0] - topleft_norm[0]) / wsize,
)[:wsize],
)
grid = torch.stack(grid, dim=2).to(S.device)
assert (
grid.size(0) == hsize
), "Resampled grid expected " "height={}, actual height={}".format(hsize, grid.size(0))
assert grid.size(1) == wsize, "Resampled grid expected " "width={}, actual width={}".format(
wsize, grid.size(1)
)
S_new = F.grid_sample(
S.unsqueeze(0),
torch.unsqueeze(grid, 0),
mode="bilinear",
padding_mode="border",
align_corners=True,
).squeeze(0)
I_new = F.grid_sample(
I.unsqueeze(0),
torch.unsqueeze(grid, 0),
mode="bilinear",
padding_mode="border",
align_corners=True,
).squeeze(0)
U_new = F.grid_sample(
U.unsqueeze(0),
torch.unsqueeze(grid, 0),
mode="bilinear",
padding_mode="border",
align_corners=True,
).squeeze(0)
V_new = F.grid_sample(
V.unsqueeze(0),
torch.unsqueeze(grid, 0),
mode="bilinear",
padding_mode="border",
align_corners=True,
).squeeze(0)
return S_new, I_new, U_new, V_new
def crop(self, indices_cropped, bboxes_old, bboxes_new):
"""
Crop outputs for selected bounding boxes to the new bounding boxes.
"""
# VK: cropping is ignored for now
# for i, ic in enumerate(indices_cropped):
# self.S[ic], self.I[ic], self.U[ic], self.V[ic] = \
# self._crop(self.S[ic], self.I[ic], self.U[ic], self.V[ic],
# bboxes_old[i], bboxes_new[i])
pass
def hflip(self, transform_data: DensePoseTransformData) -> None:
"""
Change S, I, U and V to take into account a Horizontal flip.
"""
if self.I.shape[0] > 0:
for el in "SIUV":
self.__dict__[el] = torch.flip(self.__dict__[el], [3])
self._flip_iuv_semantics_tensor(transform_data)
self._flip_segm_semantics_tensor(transform_data)
def _flip_iuv_semantics_tensor(self, dp_transform_data: DensePoseTransformData) -> None:
point_label_symmetries = dp_transform_data.point_label_symmetries
uv_symmetries = dp_transform_data.uv_symmetries
N, C, H, W = self.U.shape
u_loc = (self.U[:, 1:, :, :].clamp(0, 1) * 255).long()
v_loc = (self.V[:, 1:, :, :].clamp(0, 1) * 255).long()
Iindex = torch.arange(C - 1, device=self.U.device)[None, :, None, None].expand(
N, C - 1, H, W
)
self.U[:, 1:, :, :] = uv_symmetries["U_transforms"][Iindex, v_loc, u_loc].to(
device=self.U.device
)
self.V[:, 1:, :, :] = uv_symmetries["V_transforms"][Iindex, v_loc, u_loc].to(
device=self.V.device
)
for el in "IUV":
self.__dict__[el] = self.__dict__[el][:, point_label_symmetries, :, :]
def _flip_segm_semantics_tensor(self, dp_transform_data):
if self.S.shape[1] == DensePoseDataRelative.N_BODY_PARTS + 1:
self.S = self.S[:, dp_transform_data.mask_label_symmetries, :, :]
def to_result(self, boxes_xywh):
"""
Convert DensePose outputs to results format. Results are more compact,
but cannot be resampled any more
"""
result = DensePoseResult(boxes_xywh, self.S, self.I, self.U, self.V)
return result
def __getitem__(self, item):
if isinstance(item, int):
S_selected = self.S[item].unsqueeze(0)
I_selected = self.I[item].unsqueeze(0)
U_selected = self.U[item].unsqueeze(0)
V_selected = self.V[item].unsqueeze(0)
conf_selected = {}
for key in self.confidences:
conf_selected[key] = self.confidences[key][item].unsqueeze(0)
else:
S_selected = self.S[item]
I_selected = self.I[item]
U_selected = self.U[item]
V_selected = self.V[item]
conf_selected = {}
for key in self.confidences:
conf_selected[key] = self.confidences[key][item]
return DensePoseOutput(S_selected, I_selected, U_selected, V_selected, conf_selected)
def __str__(self):
s = "DensePoseOutput S {}, I {}, U {}, V {}".format(
list(self.S.size()), list(self.I.size()), list(self.U.size()), list(self.V.size())
)
s_conf = "confidences: [{}]".format(
", ".join([f"{key} {list(self.confidences[key].size())}" for key in self.confidences])
)
return ", ".join([s, s_conf])
def __len__(self):
return self.S.size(0)
class DensePoseResult(object):
def __init__(self, boxes_xywh, S, I, U, V):
self.results = []
self.boxes_xywh = boxes_xywh.cpu().tolist()
assert len(boxes_xywh.size()) == 2
assert boxes_xywh.size(1) == 4
for i, box_xywh in enumerate(boxes_xywh):
result_i = self._output_to_result(box_xywh, S[[i]], I[[i]], U[[i]], V[[i]])
result_numpy_i = result_i.cpu().numpy()
result_encoded_i = DensePoseResult.encode_png_data(result_numpy_i)
result_encoded_with_shape_i = (result_numpy_i.shape, result_encoded_i)
self.results.append(result_encoded_with_shape_i)
def __str__(self):
s = "DensePoseResult: N={} [{}]".format(
len(self.results), ", ".join([str(list(r[0])) for r in self.results])
)
return s
def _output_to_result(self, box_xywh, S, I, U, V):
x, y, w, h = box_xywh
w = max(int(w), 1)
h = max(int(h), 1)
result = torch.zeros([3, h, w], dtype=torch.uint8, device=U.device)
assert (
len(S.size()) == 4
), "AnnIndex tensor size should have {} " "dimensions but has {}".format(4, len(S.size()))
s_bbox = F.interpolate(S, (h, w), mode="bilinear", align_corners=False).argmax(dim=1)
        assert (
            len(I.size()) == 4
        ), "IndexUV tensor size should have {} dimensions but has {}".format(4, len(I.size()))
i_bbox = (
F.interpolate(I, (h, w), mode="bilinear", align_corners=False).argmax(dim=1)
* (s_bbox > 0).long()
).squeeze(0)
assert len(U.size()) == 4, "U tensor size should have {} " "dimensions but has {}".format(
4, len(U.size())
)
u_bbox = F.interpolate(U, (h, w), mode="bilinear", align_corners=False)
assert len(V.size()) == 4, "V tensor size should have {} " "dimensions but has {}".format(
4, len(V.size())
)
v_bbox = F.interpolate(V, (h, w), mode="bilinear", align_corners=False)
result[0] = i_bbox
for part_id in range(1, u_bbox.size(1)):
result[1][i_bbox == part_id] = (
(u_bbox[0, part_id][i_bbox == part_id] * 255).clamp(0, 255).to(torch.uint8)
)
result[2][i_bbox == part_id] = (
(v_bbox[0, part_id][i_bbox == part_id] * 255).clamp(0, 255).to(torch.uint8)
)
        assert (
            result.size(1) == h
        ), "Results height {} should be equal to bounding box height {}".format(result.size(1), h)
        assert (
            result.size(2) == w
        ), "Results width {} should be equal to bounding box width {}".format(result.size(2), w)
return result
@staticmethod
def encode_png_data(arr):
"""
Encode array data as a PNG image using the highest compression rate
@param arr [in] Data stored in an array of size (3, M, N) of type uint8
@return Base64-encoded string containing PNG-compressed data
"""
assert len(arr.shape) == 3, "Expected a 3D array as an input," " got a {0}D array".format(
len(arr.shape)
)
assert arr.shape[0] == 3, "Expected first array dimension of size 3," " got {0}".format(
arr.shape[0]
)
assert arr.dtype == np.uint8, "Expected an array of type np.uint8, " " got {0}".format(
arr.dtype
)
data = np.moveaxis(arr, 0, -1)
im = Image.fromarray(data)
fstream = BytesIO()
im.save(fstream, format="png", optimize=True)
s = base64.encodebytes(fstream.getvalue()).decode()
return s
@staticmethod
def decode_png_data(shape, s):
"""
Decode array data from a string that contains PNG-compressed data
@param Base64-encoded string containing PNG-compressed data
@return Data stored in an array of size (3, M, N) of type uint8
"""
fstream = BytesIO(base64.decodebytes(s.encode()))
im = Image.open(fstream)
data = np.moveaxis(np.array(im.getdata(), dtype=np.uint8), -1, 0)
return data.reshape(shape)
def __len__(self):
return len(self.results)
def __getitem__(self, item):
result_encoded = self.results[item]
bbox_xywh = self.boxes_xywh[item]
return result_encoded, bbox_xywh
class DensePoseList(object):
_TORCH_DEVICE_CPU = torch.device("cpu")
def __init__(self, densepose_datas, boxes_xyxy_abs, image_size_hw, device=_TORCH_DEVICE_CPU):
assert len(densepose_datas) == len(
boxes_xyxy_abs
), "Attempt to initialize DensePoseList with {} DensePose datas " "and {} boxes".format(
len(densepose_datas), len(boxes_xyxy_abs)
)
self.densepose_datas = []
for densepose_data in densepose_datas:
assert isinstance(densepose_data, DensePoseDataRelative) or densepose_data is None, (
"Attempt to initialize DensePoseList with DensePose datas "
"of type {}, expected DensePoseDataRelative".format(type(densepose_data))
)
densepose_data_ondevice = (
densepose_data.to(device) if densepose_data is not None else None
)
self.densepose_datas.append(densepose_data_ondevice)
self.boxes_xyxy_abs = boxes_xyxy_abs.to(device)
self.image_size_hw = image_size_hw
self.device = device
def to(self, device):
if self.device == device:
return self
return DensePoseList(self.densepose_datas, self.boxes_xyxy_abs, self.image_size_hw, device)
def __iter__(self):
return iter(self.densepose_datas)
def __len__(self):
return len(self.densepose_datas)
def __repr__(self):
s = self.__class__.__name__ + "("
s += "num_instances={}, ".format(len(self.densepose_datas))
s += "image_width={}, ".format(self.image_size_hw[1])
s += "image_height={})".format(self.image_size_hw[0])
return s
def __getitem__(self, item):
if isinstance(item, int):
densepose_data_rel = self.densepose_datas[item]
return densepose_data_rel
elif isinstance(item, slice):
densepose_datas_rel = self.densepose_datas[item]
boxes_xyxy_abs = self.boxes_xyxy_abs[item]
return DensePoseList(
densepose_datas_rel, boxes_xyxy_abs, self.image_size_hw, self.device
)
elif isinstance(item, torch.Tensor) and (item.dtype == torch.bool):
densepose_datas_rel = [self.densepose_datas[i] for i, x in enumerate(item) if x > 0]
boxes_xyxy_abs = self.boxes_xyxy_abs[item]
return DensePoseList(
densepose_datas_rel, boxes_xyxy_abs, self.image_size_hw, self.device
)
else:
densepose_datas_rel = [self.densepose_datas[i] for i in item]
boxes_xyxy_abs = self.boxes_xyxy_abs[item]
return DensePoseList(
densepose_datas_rel, boxes_xyxy_abs, self.image_size_hw, self.device
)
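Since `encode_png_data` / `decode_png_data` are self-contained static methods, a small round-trip sketch (assuming this module is importable as `densepose.data.structures`, as the visualization code later in this commit imports it) shows that the compact result format is lossless:

```
import numpy as np
from densepose.data.structures import DensePoseResult

iuv = np.random.randint(0, 256, size=(3, 64, 48), dtype=np.uint8)  # synthetic IUV block
s = DensePoseResult.encode_png_data(iuv)                 # base64 string wrapping PNG data
iuv_decoded = DensePoseResult.decode_png_data(iuv.shape, s)
assert np.array_equal(iuv, iuv_decoded)                  # PNG compression is lossless
```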

View File

@@ -0,0 +1,158 @@
# -*- coding: utf-8 -*-
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import contextlib
import copy
import io
import itertools
import json
import logging
import os
from collections import OrderedDict
import torch
from fvcore.common.file_io import PathManager
from pycocotools.coco import COCO
from detectron2.data import MetadataCatalog
from detectron2.evaluation import DatasetEvaluator
from detectron2.structures import BoxMode
from detectron2.utils.comm import all_gather, is_main_process, synchronize
from detectron2.utils.logger import create_small_table
from .densepose_coco_evaluation import DensePoseCocoEval, DensePoseEvalMode
class DensePoseCOCOEvaluator(DatasetEvaluator):
def __init__(self, dataset_name, distributed, output_dir=None):
self._distributed = distributed
self._output_dir = output_dir
self._cpu_device = torch.device("cpu")
self._logger = logging.getLogger(__name__)
self._metadata = MetadataCatalog.get(dataset_name)
json_file = PathManager.get_local_path(self._metadata.json_file)
with contextlib.redirect_stdout(io.StringIO()):
self._coco_api = COCO(json_file)
def reset(self):
self._predictions = []
def process(self, inputs, outputs):
"""
Args:
inputs: the inputs to a COCO model (e.g., GeneralizedRCNN).
It is a list of dict. Each dict corresponds to an image and
contains keys like "height", "width", "file_name", "image_id".
outputs: the outputs of a COCO model. It is a list of dicts with key
"instances" that contains :class:`Instances`.
The :class:`Instances` object needs to have `densepose` field.
"""
for input, output in zip(inputs, outputs):
instances = output["instances"].to(self._cpu_device)
boxes = instances.pred_boxes.tensor.clone()
boxes = BoxMode.convert(boxes, BoxMode.XYXY_ABS, BoxMode.XYWH_ABS)
instances.pred_densepose = instances.pred_densepose.to_result(boxes)
json_results = prediction_to_json(instances, input["image_id"])
self._predictions.extend(json_results)
def evaluate(self):
if self._distributed:
synchronize()
predictions = all_gather(self._predictions)
predictions = list(itertools.chain(*predictions))
if not is_main_process():
return
else:
predictions = self._predictions
return copy.deepcopy(self._eval_predictions(predictions))
def _eval_predictions(self, predictions):
"""
Evaluate predictions on densepose.
Return results with the metrics of the tasks.
"""
self._logger.info("Preparing results for COCO format ...")
if self._output_dir:
file_path = os.path.join(self._output_dir, "coco_densepose_results.json")
with open(file_path, "w") as f:
json.dump(predictions, f)
f.flush()
os.fsync(f.fileno())
self._logger.info("Evaluating predictions ...")
res = OrderedDict()
results_gps, results_gpsm = _evaluate_predictions_on_coco(self._coco_api, predictions)
res["densepose_gps"] = results_gps
res["densepose_gpsm"] = results_gpsm
return res
def prediction_to_json(instances, img_id):
"""
Args:
instances (Instances): the output of the model
img_id (str): the image id in COCO
Returns:
list[dict]: the results in densepose evaluation format
"""
scores = instances.scores.tolist()
results = []
for k in range(len(instances)):
densepose = instances.pred_densepose[k]
result = {
"image_id": img_id,
"category_id": 1, # densepose only has one class
"bbox": densepose[1],
"score": scores[k],
"densepose": densepose,
}
results.append(result)
return results
def _evaluate_predictions_on_coco(coco_gt, coco_results):
metrics = ["AP", "AP50", "AP75", "APm", "APl"]
logger = logging.getLogger(__name__)
if len(coco_results) == 0: # cocoapi does not handle empty results very well
        logger.warning("No predictions from the model! Set scores to -1")
results_gps = {metric: -1 for metric in metrics}
results_gpsm = {metric: -1 for metric in metrics}
return results_gps, results_gpsm
coco_dt = coco_gt.loadRes(coco_results)
results_gps = _evaluate_predictions_on_coco_gps(coco_gt, coco_dt, metrics)
logger.info(
"Evaluation results for densepose, GPS metric: \n" + create_small_table(results_gps)
)
results_gpsm = _evaluate_predictions_on_coco_gpsm(coco_gt, coco_dt, metrics)
logger.info(
"Evaluation results for densepose, GPSm metric: \n" + create_small_table(results_gpsm)
)
return results_gps, results_gpsm
def _evaluate_predictions_on_coco_gps(coco_gt, coco_dt, metrics):
coco_eval = DensePoseCocoEval(coco_gt, coco_dt, "densepose", dpEvalMode=DensePoseEvalMode.GPS)
coco_eval.evaluate()
coco_eval.accumulate()
coco_eval.summarize()
results = {metric: float(coco_eval.stats[idx] * 100) for idx, metric in enumerate(metrics)}
return results
def _evaluate_predictions_on_coco_gpsm(coco_gt, coco_dt, metrics):
coco_eval = DensePoseCocoEval(coco_gt, coco_dt, "densepose", dpEvalMode=DensePoseEvalMode.GPSM)
coco_eval.evaluate()
coco_eval.accumulate()
coco_eval.summarize()
results = {metric: float(coco_eval.stats[idx] * 100) for idx, metric in enumerate(metrics)}
return results
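One plausible way to plug the evaluator into detectron2's standard evaluation loop; `cfg`, `model` and the dataset name are assumptions here (the dataset must be registered with DensePose annotations):

```
from detectron2.data import build_detection_test_loader
from detectron2.evaluation import inference_on_dataset

# cfg and model are assumed to be an already-built DensePose config and a trained model.
evaluator = DensePoseCOCOEvaluator("densepose_coco_2014_minival", distributed=False, output_dir="./eval")
data_loader = build_detection_test_loader(cfg, "densepose_coco_2014_minival")
results = inference_on_dataset(model, data_loader, evaluator)  # {"densepose_gps": ..., "densepose_gpsm": ...}
```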

View File

@@ -0,0 +1,75 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
from detectron2.modeling.test_time_augmentation import GeneralizedRCNNWithTTA
class DensePoseGeneralizedRCNNWithTTA(GeneralizedRCNNWithTTA):
def __init__(self, cfg, model, transform_data, tta_mapper=None, batch_size=1):
"""
Args:
cfg (CfgNode):
model (GeneralizedRCNN): a GeneralizedRCNN to apply TTA on.
transform_data (DensePoseTransformData): contains symmetry label
transforms used for horizontal flip
tta_mapper (callable): takes a dataset dict and returns a list of
augmented versions of the dataset dict. Defaults to
`DatasetMapperTTA(cfg)`.
batch_size (int): batch the augmented images into this batch size for inference.
"""
self._transform_data = transform_data
super().__init__(cfg=cfg, model=model, tta_mapper=tta_mapper, batch_size=batch_size)
# the implementation follows closely the one from detectron2/modeling
def _inference_one_image(self, input):
"""
Args:
input (dict): one dataset dict
Returns:
dict: one output dict
"""
augmented_inputs, aug_vars = self._get_augmented_inputs(input)
# Detect boxes from all augmented versions
with self._turn_off_roi_heads(["mask_on", "keypoint_on", "densepose_on"]):
# temporarily disable roi heads
all_boxes, all_scores, all_classes = self._get_augmented_boxes(
augmented_inputs, aug_vars
)
merged_instances = self._merge_detections(
all_boxes, all_scores, all_classes, (aug_vars["height"], aug_vars["width"])
)
if self.cfg.MODEL.MASK_ON or self.cfg.MODEL.DENSEPOSE_ON:
# Use the detected boxes to obtain new fields
augmented_instances = self._rescale_detected_boxes(
augmented_inputs, merged_instances, aug_vars
)
# run forward on the detected boxes
outputs = self._batch_inference(
augmented_inputs, augmented_instances, do_postprocess=False
)
# Delete now useless variables to avoid being out of memory
del augmented_inputs, augmented_instances, merged_instances
# average the predictions
if self.cfg.MODEL.MASK_ON:
outputs[0].pred_masks = self._reduce_pred_masks(outputs, aug_vars)
if self.cfg.MODEL.DENSEPOSE_ON:
outputs[0].pred_densepose = self._reduce_pred_densepose(outputs, aug_vars)
# postprocess
output = self._detector_postprocess(outputs[0], aug_vars)
return {"instances": output}
else:
return {"instances": merged_instances}
def _reduce_pred_densepose(self, outputs, aug_vars):
for idx, output in enumerate(outputs):
if aug_vars["do_hflip"][idx]:
output.pred_densepose.hflip(self._transform_data)
# Less memory-intensive averaging
for attr in "SIUV":
setattr(
outputs[0].pred_densepose,
attr,
sum(getattr(o.pred_densepose, attr) for o in outputs) / len(outputs),
)
return outputs[0].pred_densepose
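A hedged sketch of wrapping a trained model for test-time augmentation; `cfg`, `model`, the `.mat` path and the input tensor are all assumptions (the symmetry file location is normally taken from the dataset metadata, see the transform loader elsewhere in this commit):

```
from densepose.data.structures import DensePoseTransformData

transform_data = DensePoseTransformData.load("path/to/UV_symmetry_transforms.mat")  # path is an assumption
tta_model = DensePoseGeneralizedRCNNWithTTA(cfg, model, transform_data, batch_size=1)
outputs = tta_model([{"image": image_tensor, "height": 480, "width": 640}])
```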

View File

@@ -0,0 +1,213 @@
# -*- coding: utf-8 -*-
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import numpy as np
from typing import Dict
import fvcore.nn.weight_init as weight_init
import torch
import torch.nn as nn
from torch.nn import functional as F
from detectron2.layers import Conv2d, ShapeSpec, get_norm
from detectron2.modeling import ROI_HEADS_REGISTRY, StandardROIHeads
from detectron2.modeling.poolers import ROIPooler
from detectron2.modeling.roi_heads import select_foreground_proposals
from .densepose_head import (
build_densepose_data_filter,
build_densepose_head,
build_densepose_losses,
build_densepose_predictor,
densepose_inference,
)
class Decoder(nn.Module):
"""
A semantic segmentation head described in detail in the Panoptic Feature Pyramid Networks paper
(https://arxiv.org/abs/1901.02446). It takes FPN features as input and merges information from
all levels of the FPN into single output.
"""
def __init__(self, cfg, input_shape: Dict[str, ShapeSpec], in_features):
super(Decoder, self).__init__()
# fmt: off
self.in_features = in_features
feature_strides = {k: v.stride for k, v in input_shape.items()}
feature_channels = {k: v.channels for k, v in input_shape.items()}
num_classes = cfg.MODEL.ROI_DENSEPOSE_HEAD.DECODER_NUM_CLASSES
conv_dims = cfg.MODEL.ROI_DENSEPOSE_HEAD.DECODER_CONV_DIMS
self.common_stride = cfg.MODEL.ROI_DENSEPOSE_HEAD.DECODER_COMMON_STRIDE
norm = cfg.MODEL.ROI_DENSEPOSE_HEAD.DECODER_NORM
# fmt: on
self.scale_heads = []
for in_feature in self.in_features:
head_ops = []
head_length = max(
1, int(np.log2(feature_strides[in_feature]) - np.log2(self.common_stride))
)
for k in range(head_length):
conv = Conv2d(
feature_channels[in_feature] if k == 0 else conv_dims,
conv_dims,
kernel_size=3,
stride=1,
padding=1,
bias=not norm,
norm=get_norm(norm, conv_dims),
activation=F.relu,
)
weight_init.c2_msra_fill(conv)
head_ops.append(conv)
if feature_strides[in_feature] != self.common_stride:
head_ops.append(
nn.Upsample(scale_factor=2, mode="bilinear", align_corners=False)
)
self.scale_heads.append(nn.Sequential(*head_ops))
self.add_module(in_feature, self.scale_heads[-1])
self.predictor = Conv2d(conv_dims, num_classes, kernel_size=1, stride=1, padding=0)
weight_init.c2_msra_fill(self.predictor)
def forward(self, features):
for i, _ in enumerate(self.in_features):
if i == 0:
x = self.scale_heads[i](features[i])
else:
x = x + self.scale_heads[i](features[i])
x = self.predictor(x)
return x
@ROI_HEADS_REGISTRY.register()
class DensePoseROIHeads(StandardROIHeads):
"""
A Standard ROIHeads which contains an addition of DensePose head.
"""
def __init__(self, cfg, input_shape):
super().__init__(cfg, input_shape)
self._init_densepose_head(cfg, input_shape)
def _init_densepose_head(self, cfg, input_shape):
# fmt: off
self.densepose_on = cfg.MODEL.DENSEPOSE_ON
if not self.densepose_on:
return
self.densepose_data_filter = build_densepose_data_filter(cfg)
dp_pooler_resolution = cfg.MODEL.ROI_DENSEPOSE_HEAD.POOLER_RESOLUTION
dp_pooler_sampling_ratio = cfg.MODEL.ROI_DENSEPOSE_HEAD.POOLER_SAMPLING_RATIO
dp_pooler_type = cfg.MODEL.ROI_DENSEPOSE_HEAD.POOLER_TYPE
self.use_decoder = cfg.MODEL.ROI_DENSEPOSE_HEAD.DECODER_ON
# fmt: on
if self.use_decoder:
dp_pooler_scales = (1.0 / input_shape[self.in_features[0]].stride,)
else:
dp_pooler_scales = tuple(1.0 / input_shape[k].stride for k in self.in_features)
in_channels = [input_shape[f].channels for f in self.in_features][0]
if self.use_decoder:
self.decoder = Decoder(cfg, input_shape, self.in_features)
self.densepose_pooler = ROIPooler(
output_size=dp_pooler_resolution,
scales=dp_pooler_scales,
sampling_ratio=dp_pooler_sampling_ratio,
pooler_type=dp_pooler_type,
)
self.densepose_head = build_densepose_head(cfg, in_channels)
self.densepose_predictor = build_densepose_predictor(
cfg, self.densepose_head.n_out_channels
)
self.densepose_losses = build_densepose_losses(cfg)
def _forward_densepose(self, features, instances):
"""
Forward logic of the densepose prediction branch.
Args:
features (list[Tensor]): #level input features for densepose prediction
instances (list[Instances]): the per-image instances to train/predict densepose.
In training, they can be the proposals.
In inference, they can be the predicted boxes.
Returns:
In training, a dict of losses.
In inference, update `instances` with new fields "densepose" and return it.
"""
if not self.densepose_on:
return {} if self.training else instances
features = [features[f] for f in self.in_features]
if self.training:
proposals, _ = select_foreground_proposals(instances, self.num_classes)
proposals_dp = self.densepose_data_filter(proposals)
if len(proposals_dp) > 0:
# NOTE may deadlock in DDP if certain workers have empty proposals_dp
proposal_boxes = [x.proposal_boxes for x in proposals_dp]
if self.use_decoder:
features = [self.decoder(features)]
features_dp = self.densepose_pooler(features, proposal_boxes)
densepose_head_outputs = self.densepose_head(features_dp)
densepose_outputs, _, confidences, _ = self.densepose_predictor(
densepose_head_outputs
)
densepose_loss_dict = self.densepose_losses(
proposals_dp, densepose_outputs, confidences
)
return densepose_loss_dict
else:
pred_boxes = [x.pred_boxes for x in instances]
if self.use_decoder:
features = [self.decoder(features)]
features_dp = self.densepose_pooler(features, pred_boxes)
if len(features_dp) > 0:
densepose_head_outputs = self.densepose_head(features_dp)
densepose_outputs, _, confidences, _ = self.densepose_predictor(
densepose_head_outputs
)
else:
                # If no detections occurred in `instances`,
                # set densepose outputs to empty tensors
empty_tensor = torch.zeros(size=(0, 0, 0, 0), device=features_dp.device)
densepose_outputs = tuple([empty_tensor] * 4)
confidences = tuple([empty_tensor] * 4)
densepose_inference(densepose_outputs, confidences, instances)
return instances
def forward(self, images, features, proposals, targets=None):
instances, losses = super().forward(images, features, proposals, targets)
del targets, images
if self.training:
losses.update(self._forward_densepose(features, instances))
return instances, losses
def forward_with_given_boxes(self, features, instances):
"""
Use the given boxes in `instances` to produce other (non-box) per-ROI outputs.
This is useful for downstream tasks where a box is known, but need to obtain
other attributes (outputs of other heads).
Test-time augmentation also uses this.
Args:
features: same as in `forward()`
instances (list[Instances]): instances to predict other outputs. Expect the keys
"pred_boxes" and "pred_classes" to exist.
Returns:
instances (list[Instances]):
the same `Instances` objects, with extra
fields such as `pred_masks` or `pred_keypoints`.
"""
instances = super().forward_with_given_boxes(features, instances)
instances = self._forward_densepose(features, instances)
return instances
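A rough sketch of the configuration switches this head reads; the exact defaults live in the project's config files, which are not shown here:

```
from detectron2.config import get_cfg
from densepose import add_densepose_config

cfg = get_cfg()
add_densepose_config(cfg)                       # adds the ROI_DENSEPOSE_HEAD options used above
cfg.MODEL.ROI_HEADS.NAME = "DensePoseROIHeads"  # select the head registered above
cfg.MODEL.DENSEPOSE_ON = True
cfg.MODEL.ROI_DENSEPOSE_HEAD.DECODER_ON = True  # route FPN features through the Decoder first
```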

View File

@@ -0,0 +1,145 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
from typing import Any, Dict, Optional, Tuple
class EntrySelector(object):
"""
Base class for entry selectors
"""
@staticmethod
def from_string(spec: str) -> "EntrySelector":
if spec == "*":
return AllEntrySelector()
return FieldEntrySelector(spec)
class AllEntrySelector(EntrySelector):
"""
Selector that accepts all entries
"""
SPECIFIER = "*"
def __call__(self, entry):
return True
class FieldEntrySelector(EntrySelector):
"""
Selector that accepts only entries that match provided field
specifier(s). Only a limited set of specifiers is supported for now:
<specifiers>::=<specifier>[<comma><specifiers>]
<specifier>::=<field_name>[<type_delim><type>]<equal><value_or_range>
<field_name> is a valid identifier
<type> ::= "int" | "str"
<equal> ::= "="
<comma> ::= ","
<type_delim> ::= ":"
<value_or_range> ::= <value> | <range>
<range> ::= <value><range_delim><value>
<range_delim> ::= "-"
<value> is a string without spaces and special symbols
(e.g. <comma>, <equal>, <type_delim>, <range_delim>)
"""
_SPEC_DELIM = ","
_TYPE_DELIM = ":"
_RANGE_DELIM = "-"
_EQUAL = "="
_ERROR_PREFIX = "Invalid field selector specifier"
class _FieldEntryValuePredicate(object):
"""
Predicate that checks strict equality for the specified entry field
"""
def __init__(self, name: str, typespec: str, value: str):
import builtins
self.name = name
self.type = getattr(builtins, typespec) if typespec is not None else str
self.value = value
def __call__(self, entry):
return entry[self.name] == self.type(self.value)
class _FieldEntryRangePredicate(object):
"""
Predicate that checks whether an entry field falls into the specified range
"""
def __init__(self, name: str, typespec: str, vmin: str, vmax: str):
import builtins
self.name = name
self.type = getattr(builtins, typespec) if typespec is not None else str
self.vmin = vmin
self.vmax = vmax
def __call__(self, entry):
return (entry[self.name] >= self.type(self.vmin)) and (
entry[self.name] <= self.type(self.vmax)
)
def __init__(self, spec: str):
self._predicates = self._parse_specifier_into_predicates(spec)
def __call__(self, entry: Dict[str, Any]):
for predicate in self._predicates:
if not predicate(entry):
return False
return True
def _parse_specifier_into_predicates(self, spec: str):
predicates = []
specs = spec.split(self._SPEC_DELIM)
for subspec in specs:
eq_idx = subspec.find(self._EQUAL)
if eq_idx > 0:
field_name_with_type = subspec[:eq_idx]
field_name, field_type = self._parse_field_name_type(field_name_with_type)
field_value_or_range = subspec[eq_idx + 1 :]
if self._is_range_spec(field_value_or_range):
vmin, vmax = self._get_range_spec(field_value_or_range)
predicate = FieldEntrySelector._FieldEntryRangePredicate(
field_name, field_type, vmin, vmax
)
else:
predicate = FieldEntrySelector._FieldEntryValuePredicate(
field_name, field_type, field_value_or_range
)
predicates.append(predicate)
elif eq_idx == 0:
self._parse_error(f'"{subspec}", field name is empty!')
else:
self._parse_error(f'"{subspec}", should have format ' "<field>=<value_or_range>!")
return predicates
def _parse_field_name_type(self, field_name_with_type: str) -> Tuple[str, Optional[str]]:
type_delim_idx = field_name_with_type.find(self._TYPE_DELIM)
if type_delim_idx > 0:
field_name = field_name_with_type[:type_delim_idx]
field_type = field_name_with_type[type_delim_idx + 1 :]
elif type_delim_idx == 0:
self._parse_error(f'"{field_name_with_type}", field name is empty!')
else:
field_name = field_name_with_type
field_type = None
return field_name, field_type
def _is_range_spec(self, field_value_or_range):
delim_idx = field_value_or_range.find(self._RANGE_DELIM)
return delim_idx > 0
def _get_range_spec(self, field_value_or_range):
if self._is_range_spec(field_value_or_range):
delim_idx = field_value_or_range.find(self._RANGE_DELIM)
vmin = field_value_or_range[:delim_idx]
vmax = field_value_or_range[delim_idx + 1 :]
return vmin, vmax
else:
            self._parse_error(f'"{field_value_or_range}", range of values expected!')
def _parse_error(self, msg):
raise ValueError(f"{self._ERROR_PREFIX}: {msg}")
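A small illustration of the specifier grammar documented above (a sketch meant to run alongside these classes):

```
selector = EntrySelector.from_string("image_id:int=18-25,dataset:str=val2014")
print(selector({"image_id": 20, "dataset": "val2014"}))  # True: 18 <= 20 <= 25 and the dataset matches
print(selector({"image_id": 30, "dataset": "val2014"}))  # False: 30 falls outside the 18-25 range
```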

View File

@@ -0,0 +1,13 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import logging
def verbosity_to_level(verbosity):
if verbosity is not None:
if verbosity == 0:
return logging.WARNING
elif verbosity == 1:
return logging.INFO
elif verbosity >= 2:
return logging.DEBUG
return logging.WARNING
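One plausible use with an argparse verbosity counter; `parser`, `args` and `logger` are assumptions here:

```
args = parser.parse_args()  # parser is assumed to define a counted -v/--verbosity option
logger.setLevel(verbosity_to_level(args.verbosity))
```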

View File

@@ -0,0 +1,16 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
from fvcore.common.file_io import PathManager
from detectron2.data import MetadataCatalog
from densepose import DensePoseTransformData
def load_for_dataset(dataset_name):
path = MetadataCatalog.get(dataset_name).densepose_transform_src
densepose_transform_data_fpath = PathManager.get_local_path(path)
return DensePoseTransformData.load(densepose_transform_data_fpath)
def load_from_cfg(cfg):
return load_for_dataset(cfg.DATASETS.TEST[0])
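A hedged usage sketch; both helpers require the dataset metadata to define `densepose_transform_src`, and `cfg` is an assumption:

```
transform_data = load_for_dataset("densepose_coco_2014_minival")
# or, given a config whose DATASETS.TEST names a DensePose dataset:
transform_data = load_from_cfg(cfg)
```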

View File

@@ -0,0 +1,191 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import logging
import numpy as np
import cv2
import torch
Image = np.ndarray
Boxes = torch.Tensor
class MatrixVisualizer(object):
"""
Base visualizer for matrix data
"""
def __init__(
self,
inplace=True,
cmap=cv2.COLORMAP_PARULA,
val_scale=1.0,
alpha=0.7,
interp_method_matrix=cv2.INTER_LINEAR,
interp_method_mask=cv2.INTER_NEAREST,
):
self.inplace = inplace
self.cmap = cmap
self.val_scale = val_scale
self.alpha = alpha
self.interp_method_matrix = interp_method_matrix
self.interp_method_mask = interp_method_mask
def visualize(self, image_bgr, mask, matrix, bbox_xywh):
self._check_image(image_bgr)
self._check_mask_matrix(mask, matrix)
if self.inplace:
image_target_bgr = image_bgr
else:
image_target_bgr = image_bgr * 0
x, y, w, h = [int(v) for v in bbox_xywh]
if w <= 0 or h <= 0:
return image_bgr
mask, matrix = self._resize(mask, matrix, w, h)
mask_bg = np.tile((mask == 0)[:, :, np.newaxis], [1, 1, 3])
matrix_scaled = matrix.astype(np.float32) * self.val_scale
_EPSILON = 1e-6
if np.any(matrix_scaled > 255 + _EPSILON):
logger = logging.getLogger(__name__)
logger.warning(
f"Matrix has values > {255 + _EPSILON} after " f"scaling, clipping to [0..255]"
)
matrix_scaled_8u = matrix_scaled.clip(0, 255).astype(np.uint8)
matrix_vis = cv2.applyColorMap(matrix_scaled_8u, self.cmap)
matrix_vis[mask_bg] = image_target_bgr[y : y + h, x : x + w, :][mask_bg]
image_target_bgr[y : y + h, x : x + w, :] = (
image_target_bgr[y : y + h, x : x + w, :] * (1.0 - self.alpha) + matrix_vis * self.alpha
)
return image_target_bgr.astype(np.uint8)
def _resize(self, mask, matrix, w, h):
if (w != mask.shape[1]) or (h != mask.shape[0]):
            mask = cv2.resize(mask, (w, h), interpolation=self.interp_method_mask)
if (w != matrix.shape[1]) or (h != matrix.shape[0]):
            matrix = cv2.resize(matrix, (w, h), interpolation=self.interp_method_matrix)
return mask, matrix
def _check_image(self, image_rgb):
assert len(image_rgb.shape) == 3
assert image_rgb.shape[2] == 3
assert image_rgb.dtype == np.uint8
def _check_mask_matrix(self, mask, matrix):
assert len(matrix.shape) == 2
assert len(mask.shape) == 2
assert mask.dtype == np.uint8
class RectangleVisualizer(object):
_COLOR_GREEN = (18, 127, 15)
def __init__(self, color=_COLOR_GREEN, thickness=1):
self.color = color
self.thickness = thickness
def visualize(self, image_bgr, bbox_xywh, color=None, thickness=None):
x, y, w, h = bbox_xywh
color = color or self.color
thickness = thickness or self.thickness
cv2.rectangle(image_bgr, (int(x), int(y)), (int(x + w), int(y + h)), color, thickness)
return image_bgr
class PointsVisualizer(object):
_COLOR_GREEN = (18, 127, 15)
def __init__(self, color_bgr=_COLOR_GREEN, r=5):
self.color_bgr = color_bgr
self.r = r
def visualize(self, image_bgr, pts_xy, colors_bgr=None, rs=None):
for j, pt_xy in enumerate(pts_xy):
x, y = pt_xy
color_bgr = colors_bgr[j] if colors_bgr is not None else self.color_bgr
r = rs[j] if rs is not None else self.r
            cv2.circle(image_bgr, (int(x), int(y)), int(r), color_bgr, -1)
return image_bgr
class TextVisualizer(object):
_COLOR_GRAY = (218, 227, 218)
_COLOR_WHITE = (255, 255, 255)
def __init__(
self,
font_face=cv2.FONT_HERSHEY_SIMPLEX,
font_color_bgr=_COLOR_GRAY,
font_scale=0.35,
font_line_type=cv2.LINE_AA,
font_line_thickness=1,
fill_color_bgr=_COLOR_WHITE,
fill_color_transparency=1.0,
frame_color_bgr=_COLOR_WHITE,
frame_color_transparency=1.0,
frame_thickness=1,
):
self.font_face = font_face
self.font_color_bgr = font_color_bgr
self.font_scale = font_scale
self.font_line_type = font_line_type
self.font_line_thickness = font_line_thickness
self.fill_color_bgr = fill_color_bgr
self.fill_color_transparency = fill_color_transparency
self.frame_color_bgr = frame_color_bgr
self.frame_color_transparency = frame_color_transparency
self.frame_thickness = frame_thickness
def visualize(self, image_bgr, txt, topleft_xy):
txt_w, txt_h = self.get_text_size_wh(txt)
topleft_xy = tuple(map(int, topleft_xy))
x, y = topleft_xy
if self.frame_color_transparency < 1.0:
t = self.frame_thickness
image_bgr[y - t : y + txt_h + t, x - t : x + txt_w + t, :] = (
image_bgr[y - t : y + txt_h + t, x - t : x + txt_w + t, :]
* self.frame_color_transparency
+ np.array(self.frame_color_bgr) * (1.0 - self.frame_color_transparency)
            ).astype(np.float64)
if self.fill_color_transparency < 1.0:
image_bgr[y : y + txt_h, x : x + txt_w, :] = (
image_bgr[y : y + txt_h, x : x + txt_w, :] * self.fill_color_transparency
+ np.array(self.fill_color_bgr) * (1.0 - self.fill_color_transparency)
            ).astype(np.float64)
cv2.putText(
image_bgr,
txt,
topleft_xy,
self.font_face,
self.font_scale,
self.font_color_bgr,
self.font_line_thickness,
self.font_line_type,
)
return image_bgr
def get_text_size_wh(self, txt):
((txt_w, txt_h), _) = cv2.getTextSize(
txt, self.font_face, self.font_scale, self.font_line_thickness
)
return txt_w, txt_h
class CompoundVisualizer(object):
def __init__(self, visualizers):
self.visualizers = visualizers
def visualize(self, image_bgr, data):
        assert len(data) == len(
            self.visualizers
        ), "The number of data entries {} should match the number of visualizers {}".format(
            len(data), len(self.visualizers)
        )
image = image_bgr
for i, visualizer in enumerate(self.visualizers):
image = visualizer.visualize(image, data[i])
return image
def __str__(self):
visualizer_str = ", ".join([str(v) for v in self.visualizers])
return "Compound Visualizer [{}]".format(visualizer_str)

View File

@@ -0,0 +1,37 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
from .base import RectangleVisualizer, TextVisualizer
class BoundingBoxVisualizer(object):
def __init__(self):
self.rectangle_visualizer = RectangleVisualizer()
def visualize(self, image_bgr, boxes_xywh):
for bbox_xywh in boxes_xywh:
image_bgr = self.rectangle_visualizer.visualize(image_bgr, bbox_xywh)
return image_bgr
class ScoredBoundingBoxVisualizer(object):
def __init__(self, bbox_visualizer_params=None, score_visualizer_params=None):
if bbox_visualizer_params is None:
bbox_visualizer_params = {}
if score_visualizer_params is None:
score_visualizer_params = {}
self.visualizer_bbox = RectangleVisualizer(**bbox_visualizer_params)
self.visualizer_score = TextVisualizer(**score_visualizer_params)
def visualize(self, image_bgr, scored_bboxes):
boxes_xywh, box_scores = scored_bboxes
assert len(boxes_xywh) == len(
box_scores
), "Number of bounding boxes {} should be equal to the number of scores {}".format(
len(boxes_xywh), len(box_scores)
)
for i, box_xywh in enumerate(boxes_xywh):
score_i = box_scores[i]
image_bgr = self.visualizer_bbox.visualize(image_bgr, box_xywh)
score_txt = "{0:6.4f}".format(score_i)
topleft_xy = box_xywh[0], box_xywh[1]
image_bgr = self.visualizer_score.visualize(image_bgr, score_txt, topleft_xy)
return image_bgr
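A minimal sketch of the scored variant on synthetic boxes (assuming the module is importable as `densepose.vis.bounding_box`):

```
import numpy as np
from densepose.vis.bounding_box import ScoredBoundingBoxVisualizer

image_bgr = np.zeros((200, 200, 3), dtype=np.uint8)
boxes_xywh = [[10, 10, 60, 80], [100, 40, 50, 50]]
scores = [0.97, 0.42]
image_bgr = ScoredBoundingBoxVisualizer().visualize(image_bgr, (boxes_xywh, scores))
```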

View File

@@ -0,0 +1,593 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import logging
import numpy as np
from typing import Iterable, Optional, Tuple
import cv2
from ..data.structures import DensePoseDataRelative, DensePoseOutput, DensePoseResult
from .base import Boxes, Image, MatrixVisualizer, PointsVisualizer
class DensePoseResultsVisualizer(object):
def visualize(self, image_bgr: Image, densepose_result: Optional[DensePoseResult]) -> Image:
if densepose_result is None:
return image_bgr
context = self.create_visualization_context(image_bgr)
for i, result_encoded_w_shape in enumerate(densepose_result.results):
iuv_arr = DensePoseResult.decode_png_data(*result_encoded_w_shape)
bbox_xywh = densepose_result.boxes_xywh[i]
self.visualize_iuv_arr(context, iuv_arr, bbox_xywh)
image_bgr = self.context_to_image_bgr(context)
return image_bgr
class DensePoseMaskedColormapResultsVisualizer(DensePoseResultsVisualizer):
def __init__(
self,
data_extractor,
segm_extractor,
inplace=True,
cmap=cv2.COLORMAP_PARULA,
alpha=0.7,
val_scale=1.0,
):
self.mask_visualizer = MatrixVisualizer(
inplace=inplace, cmap=cmap, val_scale=val_scale, alpha=alpha
)
self.data_extractor = data_extractor
self.segm_extractor = segm_extractor
def create_visualization_context(self, image_bgr: Image):
return image_bgr
def context_to_image_bgr(self, context):
return context
def get_image_bgr_from_context(self, context):
return context
def visualize_iuv_arr(self, context, iuv_arr, bbox_xywh):
image_bgr = self.get_image_bgr_from_context(context)
matrix = self.data_extractor(iuv_arr)
segm = self.segm_extractor(iuv_arr)
mask = np.zeros(matrix.shape, dtype=np.uint8)
mask[segm > 0] = 1
image_bgr = self.mask_visualizer.visualize(image_bgr, mask, matrix, bbox_xywh)
return image_bgr
def _extract_i_from_iuvarr(iuv_arr):
return iuv_arr[0, :, :]
def _extract_u_from_iuvarr(iuv_arr):
return iuv_arr[1, :, :]
def _extract_v_from_iuvarr(iuv_arr):
return iuv_arr[2, :, :]
class DensePoseResultsMplContourVisualizer(DensePoseResultsVisualizer):
def __init__(self, levels=10, **kwargs):
self.levels = levels
self.plot_args = kwargs
def create_visualization_context(self, image_bgr: Image):
import matplotlib.pyplot as plt
from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas
context = {}
context["image_bgr"] = image_bgr
dpi = 100
height_inches = float(image_bgr.shape[0]) / dpi
width_inches = float(image_bgr.shape[1]) / dpi
fig = plt.figure(figsize=(width_inches, height_inches), dpi=dpi)
plt.axes([0, 0, 1, 1])
plt.axis("off")
context["fig"] = fig
canvas = FigureCanvas(fig)
context["canvas"] = canvas
extent = (0, image_bgr.shape[1], image_bgr.shape[0], 0)
plt.imshow(image_bgr[:, :, ::-1], extent=extent)
return context
def context_to_image_bgr(self, context):
fig = context["fig"]
w, h = map(int, fig.get_size_inches() * fig.get_dpi())
canvas = context["canvas"]
canvas.draw()
        image_1d = np.frombuffer(canvas.tostring_rgb(), dtype="uint8")
image_rgb = image_1d.reshape(h, w, 3)
image_bgr = image_rgb[:, :, ::-1].copy()
return image_bgr
def visualize_iuv_arr(self, context, iuv_arr: np.ndarray, bbox_xywh: Boxes) -> Image:
import matplotlib.pyplot as plt
u = _extract_u_from_iuvarr(iuv_arr).astype(float) / 255.0
v = _extract_v_from_iuvarr(iuv_arr).astype(float) / 255.0
extent = (
bbox_xywh[0],
bbox_xywh[0] + bbox_xywh[2],
bbox_xywh[1],
bbox_xywh[1] + bbox_xywh[3],
)
plt.contour(u, self.levels, extent=extent, **self.plot_args)
plt.contour(v, self.levels, extent=extent, **self.plot_args)
class DensePoseResultsCustomContourVisualizer(DensePoseResultsVisualizer):
"""
Contour visualization using marching squares
"""
def __init__(self, levels=10, **kwargs):
# TODO: colormap is hardcoded
cmap = cv2.COLORMAP_PARULA
if isinstance(levels, int):
self.levels = np.linspace(0, 1, levels)
else:
self.levels = levels
if "linewidths" in kwargs:
self.linewidths = kwargs["linewidths"]
else:
self.linewidths = [1] * len(self.levels)
self.plot_args = kwargs
img_colors_bgr = cv2.applyColorMap((self.levels * 255).astype(np.uint8), cmap)
self.level_colors_bgr = [
[int(v) for v in img_color_bgr.ravel()] for img_color_bgr in img_colors_bgr
]
def create_visualization_context(self, image_bgr: Image):
return image_bgr
def context_to_image_bgr(self, context):
return context
def get_image_bgr_from_context(self, context):
return context
def visualize_iuv_arr(self, context, iuv_arr: np.ndarray, bbox_xywh: Boxes) -> Image:
image_bgr = self.get_image_bgr_from_context(context)
segm = _extract_i_from_iuvarr(iuv_arr)
u = _extract_u_from_iuvarr(iuv_arr).astype(float) / 255.0
v = _extract_v_from_iuvarr(iuv_arr).astype(float) / 255.0
self._contours(image_bgr, u, segm, bbox_xywh)
self._contours(image_bgr, v, segm, bbox_xywh)
def _contours(self, image_bgr, arr, segm, bbox_xywh):
for part_idx in range(1, DensePoseDataRelative.N_PART_LABELS + 1):
mask = segm == part_idx
if not np.any(mask):
continue
arr_min = np.amin(arr[mask])
arr_max = np.amax(arr[mask])
I, J = np.nonzero(mask)
i0 = np.amin(I)
i1 = np.amax(I) + 1
j0 = np.amin(J)
j1 = np.amax(J) + 1
if (j1 == j0 + 1) or (i1 == i0 + 1):
continue
Nw = arr.shape[1] - 1
Nh = arr.shape[0] - 1
for level_idx, level in enumerate(self.levels):
if (level < arr_min) or (level > arr_max):
continue
vp = arr[i0:i1, j0:j1] >= level
bin_codes = vp[:-1, :-1] + vp[1:, :-1] * 2 + vp[1:, 1:] * 4 + vp[:-1, 1:] * 8
mp = mask[i0:i1, j0:j1]
bin_mask_codes = mp[:-1, :-1] + mp[1:, :-1] * 2 + mp[1:, 1:] * 4 + mp[:-1, 1:] * 8
it = np.nditer(bin_codes, flags=["multi_index"])
color_bgr = self.level_colors_bgr[level_idx]
linewidth = self.linewidths[level_idx]
while not it.finished:
if (it[0] != 0) and (it[0] != 15):
i, j = it.multi_index
if bin_mask_codes[i, j] != 0:
self._draw_line(
image_bgr,
arr,
mask,
level,
color_bgr,
linewidth,
it[0],
it.multi_index,
bbox_xywh,
Nw,
Nh,
(i0, j0),
)
it.iternext()
def _draw_line(
self,
image_bgr,
arr,
mask,
v,
color_bgr,
linewidth,
bin_code,
multi_idx,
bbox_xywh,
Nw,
Nh,
offset,
):
lines = self._bin_code_2_lines(arr, v, bin_code, multi_idx, Nw, Nh, offset)
x0, y0, w, h = bbox_xywh
x1 = x0 + w
y1 = y0 + h
for line in lines:
x0r, y0r = line[0]
x1r, y1r = line[1]
pt0 = (int(x0 + x0r * (x1 - x0)), int(y0 + y0r * (y1 - y0)))
pt1 = (int(x0 + x1r * (x1 - x0)), int(y0 + y1r * (y1 - y0)))
cv2.line(image_bgr, pt0, pt1, color_bgr, linewidth)
def _bin_code_2_lines(self, arr, v, bin_code, multi_idx, Nw, Nh, offset):
i0, j0 = offset
i, j = multi_idx
i += i0
j += j0
v0, v1, v2, v3 = arr[i, j], arr[i + 1, j], arr[i + 1, j + 1], arr[i, j + 1]
x0i = float(j) / Nw
y0j = float(i) / Nh
He = 1.0 / Nh
We = 1.0 / Nw
if (bin_code == 1) or (bin_code == 14):
a = (v - v0) / (v1 - v0)
b = (v - v0) / (v3 - v0)
pt1 = (x0i, y0j + a * He)
pt2 = (x0i + b * We, y0j)
return [(pt1, pt2)]
elif (bin_code == 2) or (bin_code == 13):
a = (v - v0) / (v1 - v0)
b = (v - v1) / (v2 - v1)
pt1 = (x0i, y0j + a * He)
pt2 = (x0i + b * We, y0j + He)
return [(pt1, pt2)]
elif (bin_code == 3) or (bin_code == 12):
a = (v - v0) / (v3 - v0)
b = (v - v1) / (v2 - v1)
pt1 = (x0i + a * We, y0j)
pt2 = (x0i + b * We, y0j + He)
return [(pt1, pt2)]
elif (bin_code == 4) or (bin_code == 11):
a = (v - v1) / (v2 - v1)
b = (v - v3) / (v2 - v3)
pt1 = (x0i + a * We, y0j + He)
pt2 = (x0i + We, y0j + b * He)
return [(pt1, pt2)]
elif (bin_code == 6) or (bin_code == 9):
a = (v - v0) / (v1 - v0)
b = (v - v3) / (v2 - v3)
pt1 = (x0i, y0j + a * He)
pt2 = (x0i + We, y0j + b * He)
return [(pt1, pt2)]
elif (bin_code == 7) or (bin_code == 8):
a = (v - v0) / (v3 - v0)
b = (v - v3) / (v2 - v3)
pt1 = (x0i + a * We, y0j)
pt2 = (x0i + We, y0j + b * He)
return [(pt1, pt2)]
elif bin_code == 5:
a1 = (v - v0) / (v1 - v0)
b1 = (v - v1) / (v2 - v1)
pt11 = (x0i, y0j + a1 * He)
pt12 = (x0i + b1 * We, y0j + He)
a2 = (v - v0) / (v3 - v0)
b2 = (v - v3) / (v2 - v3)
pt21 = (x0i + a2 * We, y0j)
pt22 = (x0i + We, y0j + b2 * He)
return [(pt11, pt12), (pt21, pt22)]
elif bin_code == 10:
a1 = (v - v0) / (v3 - v0)
b1 = (v - v0) / (v1 - v0)
pt11 = (x0i + a1 * We, y0j)
pt12 = (x0i, y0j + b1 * He)
a2 = (v - v1) / (v2 - v1)
b2 = (v - v3) / (v2 - v3)
pt21 = (x0i + a2 * We, y0j + He)
pt22 = (x0i + We, y0j + b2 * He)
return [(pt11, pt12), (pt21, pt22)]
return []
try:
import matplotlib
matplotlib.use("Agg")
DensePoseResultsContourVisualizer = DensePoseResultsMplContourVisualizer
except ModuleNotFoundError:
logger = logging.getLogger(__name__)
logger.warning("Could not import matplotlib, using custom contour visualizer")
DensePoseResultsContourVisualizer = DensePoseResultsCustomContourVisualizer
class DensePoseResultsFineSegmentationVisualizer(DensePoseMaskedColormapResultsVisualizer):
def __init__(self, inplace=True, cmap=cv2.COLORMAP_PARULA, alpha=0.7):
super(DensePoseResultsFineSegmentationVisualizer, self).__init__(
_extract_i_from_iuvarr,
_extract_i_from_iuvarr,
inplace,
cmap,
alpha,
val_scale=255.0 / DensePoseDataRelative.N_PART_LABELS,
)
class DensePoseResultsUVisualizer(DensePoseMaskedColormapResultsVisualizer):
def __init__(self, inplace=True, cmap=cv2.COLORMAP_PARULA, alpha=0.7):
super(DensePoseResultsUVisualizer, self).__init__(
_extract_u_from_iuvarr, _extract_i_from_iuvarr, inplace, cmap, alpha, val_scale=1.0
)
class DensePoseResultsVVisualizer(DensePoseMaskedColormapResultsVisualizer):
def __init__(self, inplace=True, cmap=cv2.COLORMAP_PARULA, alpha=0.7):
super(DensePoseResultsVVisualizer, self).__init__(
_extract_v_from_iuvarr, _extract_i_from_iuvarr, inplace, cmap, alpha, val_scale=1.0
)
class DensePoseOutputsFineSegmentationVisualizer(object):
def __init__(self, inplace=True, cmap=cv2.COLORMAP_PARULA, alpha=0.7):
self.mask_visualizer = MatrixVisualizer(
inplace=inplace,
cmap=cmap,
val_scale=255.0 / DensePoseDataRelative.N_PART_LABELS,
alpha=alpha,
)
def visualize(
self, image_bgr: Image, dp_output_with_bboxes: Optional[Tuple[DensePoseOutput, Boxes]]
) -> Image:
if dp_output_with_bboxes is None:
return image_bgr
densepose_output, bboxes_xywh = dp_output_with_bboxes
S = densepose_output.S
I = densepose_output.I # noqa
U = densepose_output.U
V = densepose_output.V
N = S.size(0)
assert N == I.size(
0
), "densepose outputs S {} and I {}" " should have equal first dim size".format(
S.size(), I.size()
)
assert N == U.size(
0
), "densepose outputs S {} and U {}" " should have equal first dim size".format(
S.size(), U.size()
)
assert N == V.size(
0
), "densepose outputs S {} and V {}" " should have equal first dim size".format(
S.size(), V.size()
)
assert N == len(
bboxes_xywh
), "number of bounding boxes {}" " should be equal to first dim size of outputs {}".format(
len(bboxes_xywh), N
)
for n in range(N):
Sn = S[n].argmax(dim=0)
In = I[n].argmax(dim=0) * (Sn > 0).long()
matrix = In.cpu().numpy().astype(np.uint8)
mask = np.zeros(matrix.shape, dtype=np.uint8)
mask[matrix > 0] = 1
bbox_xywh = bboxes_xywh[n]
image_bgr = self.mask_visualizer.visualize(image_bgr, mask, matrix, bbox_xywh)
return image_bgr
class DensePoseOutputsUVisualizer(object):
def __init__(self, inplace=True, cmap=cv2.COLORMAP_PARULA, alpha=0.7):
self.mask_visualizer = MatrixVisualizer(
inplace=inplace, cmap=cmap, val_scale=1.0, alpha=alpha
)
def visualize(
self, image_bgr: Image, dp_output_with_bboxes: Optional[Tuple[DensePoseOutput, Boxes]]
) -> Image:
if dp_output_with_bboxes is None:
return image_bgr
densepose_output, bboxes_xywh = dp_output_with_bboxes
assert isinstance(
densepose_output, DensePoseOutput
), "DensePoseOutput expected, {} encountered".format(type(densepose_output))
S = densepose_output.S
I = densepose_output.I # noqa
U = densepose_output.U
V = densepose_output.V
N = S.size(0)
assert N == I.size(
0
), "densepose outputs S {} and I {}" " should have equal first dim size".format(
S.size(), I.size()
)
assert N == U.size(
0
), "densepose outputs S {} and U {}" " should have equal first dim size".format(
S.size(), U.size()
)
assert N == V.size(
0
), "densepose outputs S {} and V {}" " should have equal first dim size".format(
S.size(), V.size()
)
assert N == len(
bboxes_xywh
), "number of bounding boxes {}" " should be equal to first dim size of outputs {}".format(
len(bboxes_xywh), N
)
for n in range(N):
Sn = S[n].argmax(dim=0)
In = I[n].argmax(dim=0) * (Sn > 0).long()
segmentation = In.cpu().numpy().astype(np.uint8)
mask = np.zeros(segmentation.shape, dtype=np.uint8)
mask[segmentation > 0] = 1
Un = U[n].cpu().numpy().astype(np.float32)
Uvis = np.zeros(segmentation.shape, dtype=np.float32)
for partId in range(Un.shape[0]):
Uvis[segmentation == partId] = Un[partId][segmentation == partId].clip(0, 1) * 255
bbox_xywh = bboxes_xywh[n]
image_bgr = self.mask_visualizer.visualize(image_bgr, mask, Uvis, bbox_xywh)
return image_bgr
class DensePoseOutputsVVisualizer(object):
def __init__(self, inplace=True, cmap=cv2.COLORMAP_PARULA, alpha=0.7):
self.mask_visualizer = MatrixVisualizer(
inplace=inplace, cmap=cmap, val_scale=1.0, alpha=alpha
)
def visualize(
self, image_bgr: Image, dp_output_with_bboxes: Optional[Tuple[DensePoseOutput, Boxes]]
) -> Image:
if dp_output_with_bboxes is None:
return image_bgr
densepose_output, bboxes_xywh = dp_output_with_bboxes
assert isinstance(
densepose_output, DensePoseOutput
), "DensePoseOutput expected, {} encountered".format(type(densepose_output))
S = densepose_output.S
I = densepose_output.I # noqa
U = densepose_output.U
V = densepose_output.V
N = S.size(0)
assert N == I.size(
0
), "densepose outputs S {} and I {}" " should have equal first dim size".format(
S.size(), I.size()
)
assert N == U.size(
0
), "densepose outputs S {} and U {}" " should have equal first dim size".format(
S.size(), U.size()
)
assert N == V.size(
0
), "densepose outputs S {} and V {}" " should have equal first dim size".format(
S.size(), V.size()
)
assert N == len(
bboxes_xywh
), "number of bounding boxes {}" " should be equal to first dim size of outputs {}".format(
len(bboxes_xywh), N
)
for n in range(N):
Sn = S[n].argmax(dim=0)
In = I[n].argmax(dim=0) * (Sn > 0).long()
segmentation = In.cpu().numpy().astype(np.uint8)
mask = np.zeros(segmentation.shape, dtype=np.uint8)
mask[segmentation > 0] = 1
Vn = V[n].cpu().numpy().astype(np.float32)
Vvis = np.zeros(segmentation.shape, dtype=np.float32)
            for partId in range(Vn.shape[0]):
Vvis[segmentation == partId] = Vn[partId][segmentation == partId].clip(0, 1) * 255
bbox_xywh = bboxes_xywh[n]
image_bgr = self.mask_visualizer.visualize(image_bgr, mask, Vvis, bbox_xywh)
return image_bgr
class DensePoseDataCoarseSegmentationVisualizer(object):
"""
Visualizer for ground truth segmentation
"""
def __init__(self, inplace=True, cmap=cv2.COLORMAP_PARULA, alpha=0.7):
self.mask_visualizer = MatrixVisualizer(
inplace=inplace,
cmap=cmap,
val_scale=255.0 / DensePoseDataRelative.N_BODY_PARTS,
alpha=alpha,
)
def visualize(
self,
image_bgr: Image,
bbox_densepose_datas: Optional[Tuple[Iterable[Boxes], Iterable[DensePoseDataRelative]]],
) -> Image:
if bbox_densepose_datas is None:
return image_bgr
for bbox_xywh, densepose_data in zip(*bbox_densepose_datas):
matrix = densepose_data.segm.numpy()
mask = np.zeros(matrix.shape, dtype=np.uint8)
mask[matrix > 0] = 1
image_bgr = self.mask_visualizer.visualize(image_bgr, mask, matrix, bbox_xywh.numpy())
return image_bgr
class DensePoseDataPointsVisualizer(object):
def __init__(self, densepose_data_to_value_fn=None, cmap=cv2.COLORMAP_PARULA):
self.points_visualizer = PointsVisualizer()
self.densepose_data_to_value_fn = densepose_data_to_value_fn
self.cmap = cmap
def visualize(
self,
image_bgr: Image,
bbox_densepose_datas: Optional[Tuple[Iterable[Boxes], Iterable[DensePoseDataRelative]]],
) -> Image:
if bbox_densepose_datas is None:
return image_bgr
for bbox_xywh, densepose_data in zip(*bbox_densepose_datas):
x0, y0, w, h = bbox_xywh.numpy()
x = densepose_data.x.numpy() * w / 255.0 + x0
y = densepose_data.y.numpy() * h / 255.0 + y0
pts_xy = zip(x, y)
if self.densepose_data_to_value_fn is None:
image_bgr = self.points_visualizer.visualize(image_bgr, pts_xy)
else:
v = self.densepose_data_to_value_fn(densepose_data)
img_colors_bgr = cv2.applyColorMap(v, self.cmap)
colors_bgr = [
[int(v) for v in img_color_bgr.ravel()] for img_color_bgr in img_colors_bgr
]
image_bgr = self.points_visualizer.visualize(image_bgr, pts_xy, colors_bgr)
return image_bgr
def _densepose_data_u_for_cmap(densepose_data):
u = np.clip(densepose_data.u.numpy(), 0, 1) * 255.0
return u.astype(np.uint8)
def _densepose_data_v_for_cmap(densepose_data):
v = np.clip(densepose_data.v.numpy(), 0, 1) * 255.0
return v.astype(np.uint8)
def _densepose_data_i_for_cmap(densepose_data):
i = (
np.clip(densepose_data.i.numpy(), 0.0, DensePoseDataRelative.N_PART_LABELS)
* 255.0
/ DensePoseDataRelative.N_PART_LABELS
)
return i.astype(np.uint8)
class DensePoseDataPointsUVisualizer(DensePoseDataPointsVisualizer):
def __init__(self):
super(DensePoseDataPointsUVisualizer, self).__init__(
densepose_data_to_value_fn=_densepose_data_u_for_cmap
)
class DensePoseDataPointsVVisualizer(DensePoseDataPointsVisualizer):
def __init__(self):
super(DensePoseDataPointsVVisualizer, self).__init__(
densepose_data_to_value_fn=_densepose_data_v_for_cmap
)
class DensePoseDataPointsIVisualizer(DensePoseDataPointsVisualizer):
def __init__(self):
super(DensePoseDataPointsIVisualizer, self).__init__(
densepose_data_to_value_fn=_densepose_data_i_for_cmap
)

View File

@@ -0,0 +1,152 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import logging
from typing import Sequence
import torch
from detectron2.layers.nms import batched_nms
from detectron2.structures.instances import Instances
from densepose.vis.bounding_box import BoundingBoxVisualizer, ScoredBoundingBoxVisualizer
from densepose.vis.densepose import DensePoseResultsVisualizer
from .base import CompoundVisualizer
Scores = Sequence[float]
def extract_scores_from_instances(instances: Instances, select=None):
if instances.has("scores"):
return instances.scores if select is None else instances.scores[select]
return None
def extract_boxes_xywh_from_instances(instances: Instances, select=None):
if instances.has("pred_boxes"):
boxes_xywh = instances.pred_boxes.tensor.clone()
boxes_xywh[:, 2] -= boxes_xywh[:, 0]
boxes_xywh[:, 3] -= boxes_xywh[:, 1]
return boxes_xywh if select is None else boxes_xywh[select]
return None
def create_extractor(visualizer: object):
"""
Create an extractor for the provided visualizer
"""
if isinstance(visualizer, CompoundVisualizer):
extractors = [create_extractor(v) for v in visualizer.visualizers]
return CompoundExtractor(extractors)
elif isinstance(visualizer, DensePoseResultsVisualizer):
return DensePoseResultExtractor()
elif isinstance(visualizer, ScoredBoundingBoxVisualizer):
return CompoundExtractor([extract_boxes_xywh_from_instances, extract_scores_from_instances])
elif isinstance(visualizer, BoundingBoxVisualizer):
return extract_boxes_xywh_from_instances
else:
logger = logging.getLogger(__name__)
logger.error(f"Could not create extractor for {visualizer}")
return None
class BoundingBoxExtractor(object):
"""
Extracts bounding boxes from instances
"""
def __call__(self, instances: Instances):
boxes_xywh = extract_boxes_xywh_from_instances(instances)
return boxes_xywh
class ScoredBoundingBoxExtractor(object):
"""
    Extracts bounding boxes and scores from instances
"""
def __call__(self, instances: Instances, select=None):
scores = extract_scores_from_instances(instances)
boxes_xywh = extract_boxes_xywh_from_instances(instances)
if (scores is None) or (boxes_xywh is None):
return (boxes_xywh, scores)
if select is not None:
scores = scores[select]
boxes_xywh = boxes_xywh[select]
return (boxes_xywh, scores)
class DensePoseResultExtractor(object):
"""
Extracts DensePose result from instances
"""
def __call__(self, instances: Instances, select=None):
boxes_xywh = extract_boxes_xywh_from_instances(instances)
if instances.has("pred_densepose") and (boxes_xywh is not None):
dpout = instances.pred_densepose
if select is not None:
dpout = dpout[select]
boxes_xywh = boxes_xywh[select]
return dpout.to_result(boxes_xywh)
else:
return None
class CompoundExtractor(object):
"""
Extracts data for CompoundVisualizer
"""
def __init__(self, extractors):
self.extractors = extractors
def __call__(self, instances: Instances, select=None):
datas = []
for extractor in self.extractors:
data = extractor(instances, select)
datas.append(data)
return datas
class NmsFilteredExtractor(object):
"""
Extracts data in the format accepted by NmsFilteredVisualizer
"""
def __init__(self, extractor, iou_threshold):
self.extractor = extractor
self.iou_threshold = iou_threshold
def __call__(self, instances: Instances, select=None):
scores = extract_scores_from_instances(instances)
boxes_xywh = extract_boxes_xywh_from_instances(instances)
if boxes_xywh is None:
return None
select_local_idx = batched_nms(
boxes_xywh,
scores,
torch.zeros(len(scores), dtype=torch.int32),
iou_threshold=self.iou_threshold,
).squeeze()
select_local = torch.zeros(len(boxes_xywh), dtype=torch.bool, device=boxes_xywh.device)
select_local[select_local_idx] = True
select = select_local if select is None else (select & select_local)
return self.extractor(instances, select=select)
class ScoreThresholdedExtractor(object):
"""
Extracts data in the format accepted by ScoreThresholdedVisualizer
"""
def __init__(self, extractor, min_score):
self.extractor = extractor
self.min_score = min_score
def __call__(self, instances: Instances, select=None):
scores = extract_scores_from_instances(instances)
if scores is None:
return None
select_local = scores > self.min_score
select = select_local if select is None else (select & select_local)
data = self.extractor(instances, select=select)
return data

View File

@@ -0,0 +1,7 @@
## Some scripts for developers to use include:
- `run_instant_tests.sh`: run training for a few iterations.
- `run_inference_tests.sh`: run inference on a small dataset.
- `../../dev/linter.sh`: lint the codebase before committing.
- `../../dev/parse_results.sh`: parse results from log file.
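As a quick illustration (assuming the scripts are invoked from the DensePose project root, which matches the relative `./configs/...` paths they use), the instant tests can be run over all quick-schedule configs with:
```bash
# Runs training for a few iterations for every *instant_test.yaml config;
# pass one or more config files as arguments to restrict the run instead.
./dev/run_instant_tests.sh
```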

View File

@@ -0,0 +1,33 @@
#!/bin/bash -e
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
BIN="python train_net.py"
OUTPUT="inference_test_output"
NUM_GPUS=2
IMS_PER_GPU=2
IMS_PER_BATCH=$(( NUM_GPUS * IMS_PER_GPU ))
CFG_LIST=( "${@:1}" )
if [ ${#CFG_LIST[@]} -eq 0 ]; then
CFG_LIST=( ./configs/quick_schedules/*inference_acc_test.yaml )
fi
echo "========================================================================"
echo "Configs to run:"
echo "${CFG_LIST[@]}"
echo "========================================================================"
for cfg in "${CFG_LIST[@]}"; do
echo "========================================================================"
echo "Running $cfg ..."
echo "========================================================================"
$BIN \
--eval-only \
--num-gpus $NUM_GPUS \
--config-file "$cfg" \
OUTPUT_DIR "$OUTPUT" \
SOLVER.IMS_PER_BATCH $IMS_PER_BATCH
rm -rf $OUTPUT
done

View File

@@ -0,0 +1,28 @@
#!/bin/bash -e
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
BIN="python train_net.py"
OUTPUT="instant_test_output"
NUM_GPUS=2
SOLVER_IMS_PER_BATCH=$((NUM_GPUS * 2))
CFG_LIST=( "${@:1}" )
if [ ${#CFG_LIST[@]} -eq 0 ]; then
CFG_LIST=( ./configs/quick_schedules/*instant_test.yaml )
fi
echo "========================================================================"
echo "Configs to run:"
echo "${CFG_LIST[@]}"
echo "========================================================================"
for cfg in "${CFG_LIST[@]}"; do
echo "========================================================================"
echo "Running $cfg ..."
echo "========================================================================"
$BIN --num-gpus $NUM_GPUS --config-file "$cfg" \
SOLVER.IMS_PER_BATCH $SOLVER_IMS_PER_BATCH \
OUTPUT_DIR "$OUTPUT"
rm -rf "$OUTPUT"
done

View File

@@ -0,0 +1,58 @@
# Getting Started with DensePose
## Inference with Pre-trained Models
1. Pick a model and its config file from [Model Zoo](MODEL_ZOO.md), for example [densepose_rcnn_R_50_FPN_s1x.yaml](../configs/densepose_rcnn_R_50_FPN_s1x.yaml).
2. Run the [Apply Net](TOOL_APPLY_NET.md) tool to visualize the results or save them to disk. For example, to use contour visualization for DensePose, one can run:
```bash
python apply_net.py show configs/densepose_rcnn_R_50_FPN_s1x.yaml densepose_rcnn_R_50_FPN_s1x.pkl image.jpg dp_contour,bbox --output image_densepose_contour.png
```
Please see [Apply Net](TOOL_APPLY_NET.md) for more details on the tool.
## Training
First, arrange the [dataset](http://densepose.org/#dataset) in the following structure under the directory from which you'll run the training scripts:
<pre>
datasets/coco/
annotations/
densepose_{train,minival,valminusminival}2014.json
<a href="https://dl.fbaipublicfiles.com/detectron2/densepose/densepose_minival2014_100.json">densepose_minival2014_100.json </a> (optional, for testing only)
{train,val}2014/
# image files that are mentioned in the corresponding json
</pre>
To train a model one can use the [train_net.py](../train_net.py) script.
This script was used to train all DensePose models in [Model Zoo](MODEL_ZOO.md).
For example, to launch end-to-end DensePose-RCNN training with ResNet-50 FPN backbone
on 8 GPUs following the s1x schedule, one can run
```bash
python train_net.py --config-file configs/densepose_rcnn_R_50_FPN_s1x.yaml --num-gpus 8
```
The configs are made for 8-GPU training. To train on 1 GPU, one can apply the
[linear learning rate scaling rule](https://arxiv.org/abs/1706.02677):
```bash
python train_net.py --config-file configs/densepose_rcnn_R_50_FPN_s1x.yaml \
SOLVER.IMS_PER_BATCH 2 SOLVER.BASE_LR 0.0025
```
## Evaluation
Model testing can be done in the same way as training, except for the additional flag `--eval-only` and
the model location specified via `MODEL.WEIGHTS model.pth` on the command line:
```bash
python train_net.py --config-file configs/densepose_rcnn_R_50_FPN_s1x.yaml \
--eval-only MODEL.WEIGHTS model.pth
```
## Tools
We provide tools which allow one to:
- easily view DensePose annotated data in a dataset;
- perform DensePose inference on a set of images;
- visualize DensePose model results;
`query_db` is a tool to print or visualize DensePose data in a dataset.
Please refer to [Query DB](TOOL_QUERY_DB.md) for more details on this tool.
`apply_net` is a tool to print or visualize DensePose results.
Please refer to [Apply Net](TOOL_APPLY_NET.md) for more details on this tool.
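For instance, the `query_db` tool can be used to overlay annotations (here, bounding boxes and coarse segmentation) for a single training image; see [Query DB](TOOL_QUERY_DB.md) for the full set of options:
```bash
python query_db.py show densepose_coco_2014_train image_id:int=322 bbox,dp_segm -v
```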

View File

@@ -0,0 +1,277 @@
# Model Zoo and Baselines
## Introduction
We provide baselines trained with Detectron2 DensePose. The corresponding
configuration files can be found in the [configs](../configs) directory.
All models were trained on COCO `train2014` + `valminusminival2014` and
evaluated on COCO `minival2014`. For the details on common settings in which
baselines were trained, please check [Detectron 2 Model Zoo](../../../MODEL_ZOO.md).
## License
All models available for download through this document are licensed under the
[Creative Commons Attribution-ShareAlike 3.0 license](https://creativecommons.org/licenses/by-sa/3.0/)
## COCO DensePose Baselines with DensePose-RCNN
### Legacy Models
Baselines trained using schedules from [Güler et al., 2018](https://arxiv.org/pdf/1802.00434.pdf)
<table><tbody>
<!-- START TABLE -->
<!-- TABLE HEADER -->
<th valign="bottom">Name</th>
<th valign="bottom">lr<br/>sched</th>
<th valign="bottom">train<br/>time<br/>(s/iter)</th>
<th valign="bottom">inference<br/>time<br/>(s/im)</th>
<th valign="bottom">train<br/>mem<br/>(GB)</th>
<th valign="bottom">box<br/>AP</th>
<th valign="bottom">dp. AP<br/>GPS</th>
<th valign="bottom">dp. AP<br/>GPSm</th>
<th valign="bottom">model id</th>
<th valign="bottom">download</th>
<!-- TABLE BODY -->
<!-- ROW: densepose_rcnn_R_50_FPN_s1x_legacy -->
<tr><td align="left"><a href="../configs/densepose_rcnn_R_50_FPN_s1x_legacy.yaml">R_50_FPN_s1x_legacy</a></td>
<td align="center">s1x</td>
<td align="center">0.307</td>
<td align="center">0.051</td>
<td align="center">3.2</td>
<td align="center">58.1</td>
<td align="center">52.1</td>
<td align="center">54.9</td>
<td align="center">164832157</td>
<td align="center"><a href="https://dl.fbaipublicfiles.com/densepose/densepose_rcnn_R_50_FPN_s1x_legacy/164832157/model_final_d366fa.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/densepose/densepose_rcnn_R_50_FPN_s1x_legacy/164832157/metrics.json">metrics</a></td>
</tr>
<!-- ROW: densepose_rcnn_R_101_FPN_s1x_legacy -->
<tr><td align="left"><a href="../configs/densepose_rcnn_R_101_FPN_s1x_legacy.yaml">R_101_FPN_s1x_legacy</a></td>
<td align="center">s1x</td>
<td align="center">0.390</td>
<td align="center">0.063</td>
<td align="center">4.3</td>
<td align="center">59.5</td>
<td align="center">53.2</td>
<td align="center">56.1</td>
<td align="center">164832182</td>
<td align="center"><a href="https://dl.fbaipublicfiles.com/densepose/densepose_rcnn_R_101_FPN_s1x_legacy/164832182/model_final_10af0e.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/densepose/densepose_rcnn_R_101_FPN_s1x_legacy/164832182/metrics.json">metrics</a></td>
</tr>
</tbody></table>
### Improved Baselines, Original Fully Convolutional Head
These models use an improved training schedule and Panoptic FPN head from [Kirillov et al., 2019](https://arxiv.org/abs/1901.02446).
<table><tbody>
<!-- START TABLE -->
<!-- TABLE HEADER -->
<th valign="bottom">Name</th>
<th valign="bottom">lr<br/>sched</th>
<th valign="bottom">train<br/>time<br/>(s/iter)</th>
<th valign="bottom">inference<br/>time<br/>(s/im)</th>
<th valign="bottom">train<br/>mem<br/>(GB)</th>
<th valign="bottom">box<br/>AP</th>
<th valign="bottom">dp. AP<br/>GPS</th>
<th valign="bottom">dp. AP<br/>GPSm</th>
<th valign="bottom">model id</th>
<th valign="bottom">download</th>
<!-- TABLE BODY -->
<!-- ROW: densepose_rcnn_R_50_FPN_s1x -->
<tr><td align="left"><a href="../configs/densepose_rcnn_R_50_FPN_s1x.yaml">R_50_FPN_s1x</a></td>
<td align="center">s1x</td>
<td align="center">0.359</td>
<td align="center">0.066</td>
<td align="center">4.5</td>
<td align="center">61.2</td>
<td align="center">63.7</td>
<td align="center">65.3</td>
<td align="center">165712039</td>
<td align="center"><a href="https://dl.fbaipublicfiles.com/densepose/densepose_rcnn_R_50_FPN_s1x/165712039/model_final_162be9.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/densepose/densepose_rcnn_R_50_FPN_s1x/165712039/metrics.json">metrics</a></td>
</tr>
<!-- ROW: densepose_rcnn_R_101_FPN_s1x -->
<tr><td align="left"><a href="../configs/densepose_rcnn_R_101_FPN_s1x.yaml">R_101_FPN_s1x</a></td>
<td align="center">s1x</td>
<td align="center">0.428</td>
<td align="center">0.079</td>
<td align="center">5.8</td>
<td align="center">62.3</td>
<td align="center">64.5</td>
<td align="center">66.4</td>
<td align="center">165712084</td>
<td align="center"><a href="https://dl.fbaipublicfiles.com/densepose/densepose_rcnn_R_101_FPN_s1x/165712084/model_final_c6ab63.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/densepose/densepose_rcnn_R_101_FPN_s1x/165712084/metrics.json">metrics</a></td>
</tr>
</tbody></table>
### Improved Baselines, DeepLabV3 Head
These models use an improved training schedule, Panoptic FPN head from [Kirillov et al., 2019](https://arxiv.org/abs/1901.02446), and DeepLabV3 head from [Chen et al., 2017](https://arxiv.org/abs/1706.05587).
<table><tbody>
<!-- START TABLE -->
<!-- TABLE HEADER -->
<th valign="bottom">Name</th>
<th valign="bottom">lr<br/>sched</th>
<th valign="bottom">train<br/>time<br/>(s/iter)</th>
<th valign="bottom">inference<br/>time<br/>(s/im)</th>
<th valign="bottom">train<br/>mem<br/>(GB)</th>
<th valign="bottom">box<br/>AP</th>
<th valign="bottom">dp. AP<br/>GPS</th>
<th valign="bottom">dp. AP<br/>GPSm</th>
<th valign="bottom">model id</th>
<th valign="bottom">download</th>
<!-- TABLE BODY -->
<!-- ROW: densepose_rcnn_R_50_FPN_DL_s1x -->
<tr><td align="left"><a href="../configs/densepose_rcnn_R_50_FPN_DL_s1x.yaml">R_50_FPN_DL_s1x</a></td>
<td align="center">s1x</td>
<td align="center">0.392</td>
<td align="center">0.070</td>
<td align="center">6.7</td>
<td align="center">61.1</td>
<td align="center">65.6</td>
<td align="center">66.8</td>
<td align="center">165712097</td>
<td align="center"><a href="https://dl.fbaipublicfiles.com/densepose/densepose_rcnn_R_50_FPN_DL_s1x/165712097/model_final_0ed407.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/densepose/densepose_rcnn_R_50_FPN_DL_s1x/165712097/metrics.json">metrics</a></td>
</tr>
<!-- ROW: densepose_rcnn_R_101_FPN_DL_s1x -->
<tr><td align="left"><a href="../configs/densepose_rcnn_R_101_FPN_DL_s1x.yaml">R_101_FPN_DL_s1x</a></td>
<td align="center">s1x</td>
<td align="center">0.478</td>
<td align="center">0.083</td>
<td align="center">7.0</td>
<td align="center">62.3</td>
<td align="center">66.3</td>
<td align="center">67.7</td>
<td align="center">165712116</td>
<td align="center"><a href="https://dl.fbaipublicfiles.com/densepose/densepose_rcnn_R_101_FPN_DL_s1x/165712116/model_final_844d15.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/densepose/densepose_rcnn_R_101_FPN_DL_s1x/165712116/metrics.json">metrics</a></td>
</tr>
</tbody></table>
### Baselines with Confidence Estimation
These models perform additional estimation of confidence in regressed UV coordinates, along the lines of [Neverova et al., 2019](https://papers.nips.cc/paper/8378-correlated-uncertainty-for-learning-dense-correspondences-from-noisy-labels).
<table><tbody>
<!-- START TABLE -->
<!-- TABLE HEADER -->
<th valign="bottom">Name</th>
<th valign="bottom">lr<br/>sched</th>
<th valign="bottom">train<br/>time<br/>(s/iter)</th>
<th valign="bottom">inference<br/>time<br/>(s/im)</th>
<th valign="bottom">train<br/>mem<br/>(GB)</th>
<th valign="bottom">box<br/>AP</th>
<th valign="bottom">dp. AP<br/>GPS</th>
<th valign="bottom">dp. AP<br/>GPSm</th>
<th valign="bottom">model id</th>
<th valign="bottom">download</th>
<!-- TABLE BODY -->
<!-- ROW: densepose_rcnn_R_50_FPN_WC1_s1x -->
<tr><td align="left"><a href="../configs/densepose_rcnn_R_50_FPN_WC1_s1x.yaml">R_50_FPN_WC1_s1x</a></td>
<td align="center">s1x</td>
<td align="center">0.353</td>
<td align="center">0.064</td>
<td align="center">4.6</td>
<td align="center">60.5</td>
<td align="center">64.2</td>
<td align="center">65.6</td>
<td align="center">173862049</td>
<td align="center"><a href="https://dl.fbaipublicfiles.com/densepose/densepose_rcnn_R_50_FPN_WC1_s1x/173862049/model_final_289019.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/densepose/densepose_rcnn_R_50_FPN_WC1_s1x/173862049/metrics.json">metrics</a></td>
</tr>
<!-- ROW: densepose_rcnn_R_50_FPN_WC2_s1x -->
<tr><td align="left"><a href="../configs/densepose_rcnn_R_50_FPN_WC2_s1x.yaml">R_50_FPN_WC2_s1x</a></td>
<td align="center">s1x</td>
<td align="center">0.364</td>
<td align="center">0.066</td>
<td align="center">4.8</td>
<td align="center">60.7</td>
<td align="center">64.2</td>
<td align="center">65.7</td>
<td align="center">173861455</td>
<td align="center"><a href="https://dl.fbaipublicfiles.com/densepose/densepose_rcnn_R_50_FPN_WC2_s1x/173861455/model_final_3abe14.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/densepose/densepose_rcnn_R_50_FPN_WC2_s1x/173861455/metrics.json">metrics</a></td>
</tr>
<!-- ROW: densepose_rcnn_R_50_FPN_DL_WC1_s1x -->
<tr><td align="left"><a href="../configs/densepose_rcnn_R_50_FPN_DL_WC1_s1x.yaml">R_50_FPN_DL_WC1_s1x</a></td>
<td align="center">s1x</td>
<td align="center">0.397</td>
<td align="center">0.068</td>
<td align="center">6.7</td>
<td align="center">61.1</td>
<td align="center">65.8</td>
<td align="center">67.1</td>
<td align="center">173067973</td>
<td align="center"><a href="https://dl.fbaipublicfiles.com/densepose/densepose_rcnn_R_50_FPN_DL_WC1_s1x/173067973/model_final_b1e525.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/densepose/densepose_rcnn_R_50_FPN_DL_WC1_s1x/173067973/metrics.json">metrics</a></td>
</tr>
<!-- ROW: densepose_rcnn_R_50_FPN_DL_WC2_s1x -->
<tr><td align="left"><a href="../configs/densepose_rcnn_R_50_FPN_DL_WC2_s1x.yaml">R_50_FPN_DL_WC2_s1x</a></td>
<td align="center">s1x</td>
<td align="center">0.410</td>
<td align="center">0.070</td>
<td align="center">6.8</td>
<td align="center">60.8</td>
<td align="center">65.6</td>
<td align="center">66.7</td>
<td align="center">173859335</td>
<td align="center"><a href="https://dl.fbaipublicfiles.com/densepose/densepose_rcnn_R_50_FPN_DL_WC2_s1x/173859335/model_final_60fed4.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/densepose/densepose_rcnn_R_50_FPN_DL_WC2_s1x/173859335/metrics.json">metrics</a></td>
</tr>
<!-- ROW: densepose_rcnn_R_101_FPN_WC1_s1x -->
<tr><td align="left"><a href="../configs/densepose_rcnn_R_101_FPN_WC1_s1x.yaml">R_101_FPN_WC1_s1x</a></td>
<td align="center">s1x</td>
<td align="center">0.435</td>
<td align="center">0.076</td>
<td align="center">5.7</td>
<td align="center">62.5</td>
<td align="center">64.9</td>
<td align="center">66.5</td>
<td align="center">171402969</td>
<td align="center"><a href="https://dl.fbaipublicfiles.com/densepose/densepose_rcnn_R_101_FPN_WC1_s1x/171402969/model_final_9e47f0.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/densepose/densepose_rcnn_R_101_FPN_WC1_s1x/171402969/metrics.json">metrics</a></td>
</tr>
<!-- ROW: densepose_rcnn_R_101_FPN_WC2_s1x -->
<tr><td align="left"><a href="../configs/densepose_rcnn_R_101_FPN_WC2_s1x.yaml">R_101_FPN_WC2_s1x</a></td>
<td align="center">s1x</td>
<td align="center">0.450</td>
<td align="center">0.078</td>
<td align="center">5.7</td>
<td align="center">62.3</td>
<td align="center">64.8</td>
<td align="center">66.6</td>
<td align="center">173860702</td>
<td align="center"><a href="https://dl.fbaipublicfiles.com/densepose/densepose_rcnn_R_101_FPN_WC2_s1x/173860702/model_final_5ea023.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/densepose/densepose_rcnn_R_101_FPN_WC2_s1x/173860702/metrics.json">metrics</a></td>
</tr>
<!-- ROW: densepose_rcnn_R_101_FPN_DL_WC1_s1x -->
<tr><td align="left"><a href="../configs/densepose_rcnn_R_101_FPN_DL_WC1_s1x.yaml">R_101_FPN_DL_WC1_s1x</a></td>
<td align="center">s1x</td>
<td align="center">0.479</td>
<td align="center">0.081</td>
<td align="center">7.9</td>
<td align="center">62.0</td>
<td align="center">66.2</td>
<td align="center">67.4</td>
<td align="center">173858525</td>
<td align="center"><a href="https://dl.fbaipublicfiles.com/densepose/densepose_rcnn_R_101_FPN_DL_WC1_s1x/173858525/model_final_f359f3.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/densepose/densepose_rcnn_R_101_FPN_DL_WC1_s1x/173858525/metrics.json">metrics</a></td>
</tr>
<!-- ROW: densepose_rcnn_R_101_FPN_DL_WC2_s1x -->
<tr><td align="left"><a href="../configs/densepose_rcnn_R_101_FPN_DL_WC2_s1x.yaml">R_101_FPN_DL_WC2_s1x</a></td>
<td align="center">s1x</td>
<td align="center">0.491</td>
<td align="center">0.082</td>
<td align="center">7.6</td>
<td align="center">61.7</td>
<td align="center">65.9</td>
<td align="center">67.3</td>
<td align="center">173294801</td>
<td align="center"><a href="https://dl.fbaipublicfiles.com/densepose/densepose_rcnn_R_101_FPN_DL_WC2_s1x/173294801/model_final_6e1ed1.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/densepose/densepose_rcnn_R_101_FPN_DL_WC2_s1x/173294801/metrics.json">metrics</a></td>
</tr>
</tbody></table>
## Old Baselines
It is still possible to use some baselines from [DensePose 1](https://github.com/facebookresearch/DensePose).
Below are evaluation metrics for the baselines recomputed in the current framework:
| Model | bbox AP | AP | AP50 | AP75 | APm |APl |
|-----|-----|-----|--- |--- |--- |--- |
| [`ResNet50_FPN_s1x-e2e`](https://dl.fbaipublicfiles.com/densepose/DensePose_ResNet50_FPN_s1x-e2e.pkl) | 54.673 | 48.894 | 84.963 | 50.717 | 43.132 | 50.433 |
| [`ResNet101_FPN_s1x-e2e`](https://dl.fbaipublicfiles.com/densepose/DensePose_ResNet101_FPN_s1x-e2e.pkl) | 56.032 | 51.088 | 86.250 | 55.057 | 46.542 | 52.563 |
Note: these scores are close, but not strictly equal to the ones reported in the [DensePose 1 Model Zoo](https://github.com/facebookresearch/DensePose/blob/master/MODEL_ZOO.md),
which is due to small incompatibilities between the frameworks.

View File

@@ -0,0 +1,130 @@
# Apply Net
`apply_net` is a tool to print or visualize DensePose results on a set of images.
It has two modes: `dump` to save DensePose model results to a pickle file
and `show` to visualize them on images.
## Dump Mode
The general command form is:
```bash
python apply_net.py dump [-h] [-v] [--output <dump_file>] <config> <model> <input>
```
There are three mandatory arguments:
- `<config>`, configuration file for a given model;
- `<model>`, model file with trained parameters
- `<input>`, input image file name, pattern or folder
One can additionally provide the `--output` argument to define the output file name,
which defaults to `output.pkl`.
Examples:
1. Dump results of a DensePose model with ResNet-50 FPN backbone for images
in a folder `images` to file `dump.pkl`:
```bash
python apply_net.py dump configs/densepose_rcnn_R_50_FPN_s1x.yaml DensePose_ResNet50_FPN_s1x-e2e.pkl images --output dump.pkl -v
```
2. Dump results of a DensePose model with ResNet-50 FPN backbone for images
with file name matching a pattern `image*.jpg` to file `results.pkl`:
```bash
python apply_net.py dump configs/densepose_rcnn_R_50_FPN_s1x.yaml DensePose_ResNet50_FPN_s1x-e2e.pkl "image*.jpg" --output results.pkl -v
```
If you want to load the pickle file generated by the above command:
```
import pickle
import sys

# make sure DensePose is in your PYTHONPATH, or use the following line to add it:
sys.path.append("/your_detectron2_path/detectron2_repo/projects/DensePose/")
f = open('/your_result_path/results.pkl', 'rb')
data = pickle.load(f)
```
The file `results.pkl` contains a list of results, one per image; for each image the result is a dictionary:
```
data: [{'file_name': '/your_path/image1.jpg',
'scores': tensor([0.9884]),
'pred_boxes_XYXY': tensor([[ 69.6114, 0.0000, 706.9797, 706.0000]]),
'pred_densepose': <densepose.structures.DensePoseResult object at 0x7f791b312470>},
{'file_name': '/your_path/image2.jpg',
'scores': tensor([0.9999, 0.5373, 0.3991]),
'pred_boxes_XYXY': tensor([[ 59.5734, 7.7535, 579.9311, 932.3619],
[612.9418, 686.1254, 612.9999, 704.6053],
[164.5081, 407.4034, 598.3944, 920.4266]]),
'pred_densepose': <densepose.structures.DensePoseResult object at 0x7f7071229be0>}]
```
We can use the following code to parse the outputs of the first
detected instance in the first image.
```
from densepose.structures import DensePoseResult

img_id, instance_id = 0, 0  # Look at the first image and the first detected instance
bbox_xyxy = data[img_id]['pred_boxes_XYXY'][instance_id]
result_encoded = data[img_id]['pred_densepose'].results[instance_id]
iuv_arr = DensePoseResult.decode_png_data(*result_encoded)
```
The array `bbox_xyxy` contains (x0, y0, x1, y1) of the bounding box.
The shape of `iuv_arr` is `[3, H, W]`, where (H, W) is the shape of the bounding box.
- `iuv_arr[0,:,:]`: The patch index of image points, indicating which of the 24 surface patches the point is on.
- `iuv_arr[1,:,:]`: The U-coordinate value of image points.
- `iuv_arr[2,:,:]`: The V-coordinate value of image points.
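As a minimal follow-up sketch (reusing `iuv_arr` and `bbox_xyxy` from the snippet above, and assuming `numpy` is available and `iuv_arr` behaves as a NumPy array of shape `[3, H, W]` as described), one can inspect how many pixels inside the box were assigned to each surface patch:
```
import numpy as np

# Channel 0 of iuv_arr holds the per-pixel patch index (0 = background).
iuv = np.asarray(iuv_arr)
print("IUV array shape:", iuv.shape)                 # (3, H, W)
print("Bounding box (x0, y0, x1, y1):", bbox_xyxy)
patch_ids, counts = np.unique(iuv[0][iuv[0] > 0], return_counts=True)
print("Pixels per surface patch:", dict(zip(patch_ids.tolist(), counts.tolist())))
```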
## Visualization Mode
The general command form is:
```bash
python apply_net.py show [-h] [-v] [--min_score <score>] [--nms_thresh <threshold>] [--output <image_file>] <config> <model> <input> <visualizations>
```
There are four mandatory arguments:
- `<config>`, configuration file for a given model;
- `<model>`, model file with trained parameters
- `<input>`, input image file name, pattern or folder
- `<visualizations>`, visualizations specifier; currently available visualizations are:
* `bbox` - bounding boxes of detected persons;
* `dp_segm` - segmentation masks for detected persons;
* `dp_u` - each body part is colored according to the estimated values of the
U coordinate in part parameterization;
* `dp_v` - each body part is colored according to the estimated values of the
V coordinate in part parameterization;
* `dp_contour` - plots contours with color-coded U and V coordinates
One can additionally provide the following optional arguments:
- `--min_score` to only show detections with scores not lower than the provided value
- `--nms_thresh` to additionally apply non-maximum suppression to detections at a given threshold
- `--output` to define the visualization file name template, which defaults to `output.png`.
  To distinguish output file names for different images, the tool appends a 1-based entry index,
  e.g. output.0001.png, output.0002.png, etc.
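For instance, to keep only detections with score at least 0.8, additionally apply NMS at an IoU threshold of 0.5, and write visualizations to `results.0001.png`, `results.0002.png`, ... (the model, image and output names here are purely illustrative, matching the examples that follow), one could run:
```bash
python apply_net.py show configs/densepose_rcnn_R_50_FPN_s1x.yaml DensePose_ResNet50_FPN_s1x-e2e.pkl image.jpg bbox,dp_segm --min_score 0.8 --nms_thresh 0.5 --output results.png -v
```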
The following examples show how to output results of a DensePose model
with ResNet-50 FPN backbone using different visualizations for image `image.jpg`:
1. Show bounding box and segmentation:
```bash
python apply_net.py show configs/densepose_rcnn_R_50_FPN_s1x.yaml DensePose_ResNet50_FPN_s1x-e2e.pkl image.jpg bbox,dp_segm -v
```
![Bounding Box + Segmentation Visualization](images/res_bbox_dp_segm.jpg)
2. Show bounding box and estimated U coordinates for body parts:
```bash
python apply_net.py show configs/densepose_rcnn_R_50_FPN_s1x.yaml DensePose_ResNet50_FPN_s1x-e2e.pkl image.jpg bbox,dp_u -v
```
![Bounding Box + U Coordinate Visualization](images/res_bbox_dp_u.jpg)
3. Show bounding box and estimated V coordinates for body parts:
```bash
python apply_net.py show configs/densepose_rcnn_R_50_FPN_s1x.yaml DensePose_ResNet50_FPN_s1x-e2e.pkl image.jpg bbox,dp_v -v
```
![Bounding Box + V Coordinate Visualization](images/res_bbox_dp_v.jpg)
4. Show bounding box and estimated U and V coordinates via contour plots:
```bash
python apply_net.py show configs/densepose_rcnn_R_50_FPN_s1x.yaml DensePose_ResNet50_FPN_s1x-e2e.pkl image.jpg dp_contour,bbox -v
```
![Bounding Box + Contour Visualization](images/res_bbox_dp_contour.jpg)

View File

@@ -0,0 +1,105 @@
# Query Dataset
`query_db` is a tool to print or visualize DensePose data from a dataset.
It has two modes: `print` and `show` to output dataset entries to standard
output or to visualize them on images.
## Print Mode
The general command form is:
```bash
python query_db.py print [-h] [-v] [--max-entries N] <dataset> <selector>
```
There are two mandatory arguments:
- `<dataset>`, DensePose dataset specification, from which to select
the entries (e.g. `densepose_coco_2014_train`).
- `<selector>`, dataset entry selector which can be a single specification,
or a comma-separated list of specifications of the form
`field[:type]=value` for exact match with the value
or `field[:type]=min-max` for a range of values
One can additionally limit the maximum number of entries to output
by providing the `--max-entries` argument.
Examples:
1. Output at most 10 first entries from the `densepose_coco_2014_train` dataset:
```bash
python query_db.py print densepose_coco_2014_train \* --max-entries 10 -v
```
2. Output all entries with `file_name` equal to `COCO_train2014_000000000036.jpg`:
```bash
python query_db.py print densepose_coco_2014_train file_name=COCO_train2014_000000000036.jpg -v
```
3. Output all entries with `image_id` between 36 and 156:
```bash
python query_db.py print densepose_coco_2014_train image_id:int=36-156 -v
```
## Visualization Mode
The general command form is:
```bash
python query_db.py show [-h] [-v] [--max-entries N] [--output <image_file>] <dataset> <selector> <visualizations>
```
There are three mandatory arguments:
- `<dataset>`, DensePose dataset specification, from which to select
the entries (e.g. `densepose_coco_2014_train`).
- `<selector>`, dataset entry selector which can be a single specification,
or a comma-separated list of specifications of the form
`field[:type]=value` for exact match with the value
or `field[:type]=min-max` for a range of values
- `<visualizations>`, visualizations specifier; currently available visualizations are:
* `bbox` - bounding boxes of annotated persons;
* `dp_i` - annotated points colored according to the containing part;
* `dp_pts` - annotated points in green color;
* `dp_segm` - segmentation masks for annotated persons;
* `dp_u` - annotated points colored according to their U coordinate in part parameterization;
* `dp_v` - annotated points colored according to their V coordinate in part parameterization;
One can additionally provide the following optional arguments:
- `--max-entries` to limit the maximum number of entries to visualize
- `--output` to provide the visualization file name template, which defaults
  to `output.png`. To distinguish file names for different dataset
  entries, the tool appends a 1-based entry index to the output file name,
  e.g. output.0001.png, output.0002.png, etc.
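For instance, to visualize bounding boxes and annotated points for at most three entries with `image_id` between 36 and 156, writing the images to `vis.0001.png`, `vis.0002.png`, ... (the output name here is chosen purely for illustration), one could run:
```bash
python query_db.py show densepose_coco_2014_train image_id:int=36-156 bbox,dp_pts --max-entries 3 --output vis.png -v
```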
The following examples show how to output different visualizations for image with `id = 322`
from `densepose_coco_2014_train` dataset:
1. Show bounding box and segmentation:
```bash
python query_db.py show densepose_coco_2014_train image_id:int=322 bbox,dp_segm -v
```
![Bounding Box + Segmentation Visualization](images/vis_bbox_dp_segm.jpg)
2. Show bounding box and points colored according to the containing part:
```bash
python query_db.py show densepose_coco_2014_train image_id:int=322 bbox,dp_i -v
```
![Bounding Box + Point Label Visualization](images/vis_bbox_dp_i.jpg)
3. Show bounding box and annotated points in green color:
```bash
python query_db.py show densepose_coco_2014_train image_id:int=322 bbox,dp_pts -v
```
![Bounding Box + Point Visualization](images/vis_bbox_dp_pts.jpg)
4. Show bounding box and annotated points colored according to their U coordinate in part parameterization:
```bash
python query_db.py show densepose_coco_2014_train image_id:int=322 bbox,dp_u -v
```
![Bounding Box + Point U Visualization](images/vis_bbox_dp_u.jpg)
5. Show bounding box and annotated points colored according to their V coordinate in part parameterization:
```bash
python query_db.py show densepose_coco_2014_train image_id:int=322 bbox,dp_v -v
```
![Bounding Box + Point V Visualization](images/vis_bbox_dp_v.jpg)

View File

@@ -0,0 +1,250 @@
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import argparse
import logging
import os
import sys
from timeit import default_timer as timer
from typing import Any, ClassVar, Dict, List
import torch
from fvcore.common.file_io import PathManager
from detectron2.data.catalog import DatasetCatalog
from detectron2.utils.logger import setup_logger
from densepose.data.structures import DensePoseDataRelative
from densepose.utils.dbhelper import EntrySelector
from densepose.utils.logger import verbosity_to_level
from densepose.vis.base import CompoundVisualizer
from densepose.vis.bounding_box import BoundingBoxVisualizer
from densepose.vis.densepose import (
DensePoseDataCoarseSegmentationVisualizer,
DensePoseDataPointsIVisualizer,
DensePoseDataPointsUVisualizer,
DensePoseDataPointsVisualizer,
DensePoseDataPointsVVisualizer,
)
DOC = """Query DB - a tool to print / visualize data from a database
"""
LOGGER_NAME = "query_db"
logger = logging.getLogger(LOGGER_NAME)
_ACTION_REGISTRY: Dict[str, "Action"] = {}
class Action(object):
@classmethod
def add_arguments(cls: type, parser: argparse.ArgumentParser):
parser.add_argument(
"-v",
"--verbosity",
action="count",
help="Verbose mode. Multiple -v options increase the verbosity.",
)
def register_action(cls: type):
"""
Decorator for action classes to automate action registration
"""
global _ACTION_REGISTRY
_ACTION_REGISTRY[cls.COMMAND] = cls
return cls
class EntrywiseAction(Action):
@classmethod
def add_arguments(cls: type, parser: argparse.ArgumentParser):
super(EntrywiseAction, cls).add_arguments(parser)
parser.add_argument(
"dataset", metavar="<dataset>", help="Dataset name (e.g. densepose_coco_2014_train)"
)
parser.add_argument(
"selector",
metavar="<selector>",
help="Dataset entry selector in the form field1[:type]=value1[,"
"field2[:type]=value_min-value_max...] which selects all "
"entries from the dataset that satisfy the constraints",
)
parser.add_argument(
"--max-entries", metavar="N", help="Maximum number of entries to process", type=int
)
@classmethod
def execute(cls: type, args: argparse.Namespace):
dataset = setup_dataset(args.dataset)
entry_selector = EntrySelector.from_string(args.selector)
context = cls.create_context(args)
if args.max_entries is not None:
for _, entry in zip(range(args.max_entries), dataset):
if entry_selector(entry):
cls.execute_on_entry(entry, context)
else:
for entry in dataset:
if entry_selector(entry):
cls.execute_on_entry(entry, context)
@classmethod
def create_context(cls: type, args: argparse.Namespace) -> Dict[str, Any]:
context = {}
return context
@register_action
class PrintAction(EntrywiseAction):
"""
Print action that outputs selected entries to stdout
"""
COMMAND: ClassVar[str] = "print"
@classmethod
def add_parser(cls: type, subparsers: argparse._SubParsersAction):
parser = subparsers.add_parser(cls.COMMAND, help="Output selected entries to stdout. ")
cls.add_arguments(parser)
parser.set_defaults(func=cls.execute)
@classmethod
def add_arguments(cls: type, parser: argparse.ArgumentParser):
super(PrintAction, cls).add_arguments(parser)
@classmethod
def execute_on_entry(cls: type, entry: Dict[str, Any], context: Dict[str, Any]):
import pprint
printer = pprint.PrettyPrinter(indent=2, width=200, compact=True)
printer.pprint(entry)
@register_action
class ShowAction(EntrywiseAction):
"""
Show action that visualizes selected entries on an image
"""
COMMAND: ClassVar[str] = "show"
VISUALIZERS: ClassVar[Dict[str, object]] = {
"dp_segm": DensePoseDataCoarseSegmentationVisualizer(),
"dp_i": DensePoseDataPointsIVisualizer(),
"dp_u": DensePoseDataPointsUVisualizer(),
"dp_v": DensePoseDataPointsVVisualizer(),
"dp_pts": DensePoseDataPointsVisualizer(),
"bbox": BoundingBoxVisualizer(),
}
@classmethod
def add_parser(cls: type, subparsers: argparse._SubParsersAction):
parser = subparsers.add_parser(cls.COMMAND, help="Visualize selected entries")
cls.add_arguments(parser)
parser.set_defaults(func=cls.execute)
@classmethod
def add_arguments(cls: type, parser: argparse.ArgumentParser):
super(ShowAction, cls).add_arguments(parser)
parser.add_argument(
"visualizations",
metavar="<visualizations>",
help="Comma separated list of visualizations, possible values: "
"[{}]".format(",".join(sorted(cls.VISUALIZERS.keys()))),
)
parser.add_argument(
"--output",
metavar="<image_file>",
default="output.png",
help="File name to save output to",
)
@classmethod
def execute_on_entry(cls: type, entry: Dict[str, Any], context: Dict[str, Any]):
import cv2
import numpy as np
image_fpath = PathManager.get_local_path(entry["file_name"])
image = cv2.imread(image_fpath, cv2.IMREAD_GRAYSCALE)
image = np.tile(image[:, :, np.newaxis], [1, 1, 3])
datas = cls._extract_data_for_visualizers_from_entry(context["vis_specs"], entry)
visualizer = context["visualizer"]
image_vis = visualizer.visualize(image, datas)
entry_idx = context["entry_idx"] + 1
out_fname = cls._get_out_fname(entry_idx, context["out_fname"])
cv2.imwrite(out_fname, image_vis)
logger.info(f"Output saved to {out_fname}")
context["entry_idx"] += 1
@classmethod
def _get_out_fname(cls: type, entry_idx: int, fname_base: str):
base, ext = os.path.splitext(fname_base)
return base + ".{0:04d}".format(entry_idx) + ext
@classmethod
def create_context(cls: type, args: argparse.Namespace) -> Dict[str, Any]:
vis_specs = args.visualizations.split(",")
visualizers = []
for vis_spec in vis_specs:
vis = cls.VISUALIZERS[vis_spec]
visualizers.append(vis)
context = {
"vis_specs": vis_specs,
"visualizer": CompoundVisualizer(visualizers),
"out_fname": args.output,
"entry_idx": 0,
}
return context
@classmethod
def _extract_data_for_visualizers_from_entry(
cls: type, vis_specs: List[str], entry: Dict[str, Any]
):
dp_list = []
bbox_list = []
for annotation in entry["annotations"]:
is_valid, _ = DensePoseDataRelative.validate_annotation(annotation)
if not is_valid:
continue
bbox = torch.as_tensor(annotation["bbox"])
bbox_list.append(bbox)
dp_data = DensePoseDataRelative(annotation)
dp_list.append(dp_data)
datas = []
for vis_spec in vis_specs:
datas.append(bbox_list if "bbox" == vis_spec else (bbox_list, dp_list))
return datas
def setup_dataset(dataset_name):
logger.info("Loading dataset {}".format(dataset_name))
start = timer()
dataset = DatasetCatalog.get(dataset_name)
stop = timer()
logger.info("Loaded dataset {} in {:.3f}s".format(dataset_name, stop - start))
return dataset
def create_argument_parser() -> argparse.ArgumentParser:
parser = argparse.ArgumentParser(
description=DOC,
formatter_class=lambda prog: argparse.HelpFormatter(prog, max_help_position=120),
)
parser.set_defaults(func=lambda _: parser.print_help(sys.stdout))
subparsers = parser.add_subparsers(title="Actions")
for _, action in _ACTION_REGISTRY.items():
action.add_parser(subparsers)
return parser
def main():
parser = create_argument_parser()
args = parser.parse_args()
verbosity = args.verbosity if hasattr(args, "verbosity") else None
global logger
logger = setup_logger(name=LOGGER_NAME)
logger.setLevel(verbosity_to_level(verbosity))
args.func(args)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,110 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
import os
import torch
from detectron2.config import get_cfg
from detectron2.engine import default_setup
from detectron2.modeling import build_model
from densepose import add_dataset_category_config, add_densepose_config
_BASE_CONFIG_DIR = "configs"
_EVOLUTION_CONFIG_SUB_DIR = "evolution"
_QUICK_SCHEDULES_CONFIG_SUB_DIR = "quick_schedules"
_BASE_CONFIG_FILE_PREFIX = "Base-"
_CONFIG_FILE_EXT = ".yaml"
def _get_base_config_dir():
"""
Return the base directory for configurations
"""
return os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", _BASE_CONFIG_DIR)
def _get_evolution_config_dir():
"""
Return the base directory for evolution configurations
"""
return os.path.join(_get_base_config_dir(), _EVOLUTION_CONFIG_SUB_DIR)
def _get_quick_schedules_config_dir():
"""
Return the base directory for quick schedules configurations
"""
return os.path.join(_get_base_config_dir(), _QUICK_SCHEDULES_CONFIG_SUB_DIR)
def _collect_config_files(config_dir):
"""
Collect all configuration files (i.e. densepose_*.yaml) directly in the specified directory
"""
start = _get_base_config_dir()
results = []
for entry in os.listdir(config_dir):
path = os.path.join(config_dir, entry)
if not os.path.isfile(path):
continue
_, ext = os.path.splitext(entry)
if ext != _CONFIG_FILE_EXT:
continue
if entry.startswith(_BASE_CONFIG_FILE_PREFIX):
continue
config_file = os.path.relpath(path, start)
results.append(config_file)
return results
def get_config_files():
"""
Get all the configuration files (relative to the base configuration directory)
"""
return _collect_config_files(_get_base_config_dir())
def get_evolution_config_files():
"""
Get all the evolution configuration files (relative to the base configuration directory)
"""
return _collect_config_files(_get_evolution_config_dir())
def get_quick_schedules_config_files():
"""
Get all the quick schedules configuration files (relative to the base configuration directory)
"""
return _collect_config_files(_get_quick_schedules_config_dir())
def _get_model_config(config_file):
"""
Load and return the configuration from the specified file (relative to the base configuration
directory)
"""
cfg = get_cfg()
add_dataset_category_config(cfg)
add_densepose_config(cfg)
path = os.path.join(_get_base_config_dir(), config_file)
cfg.merge_from_file(path)
if not torch.cuda.is_available():
        cfg.MODEL.DEVICE = "cpu"
return cfg
def get_model(config_file):
"""
Get the model from the specified file (relative to the base configuration directory)
"""
cfg = _get_model_config(config_file)
return build_model(cfg)
def setup(config_file):
"""
Setup the configuration from the specified file (relative to the base configuration directory)
"""
cfg = _get_model_config(config_file)
cfg.freeze()
default_setup(cfg, {})

View File

@@ -0,0 +1,43 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
import unittest
import torch
from detectron2.structures import BitMasks, Boxes, Instances
from .common import get_model
# TODO(plabatut): Modularize detectron2 tests and re-use
def make_model_inputs(image, instances=None):
if instances is None:
return {"image": image}
return {"image": image, "instances": instances}
def make_empty_instances(h, w):
instances = Instances((h, w))
instances.gt_boxes = Boxes(torch.rand(0, 4))
instances.gt_classes = torch.tensor([]).to(dtype=torch.int64)
instances.gt_masks = BitMasks(torch.rand(0, h, w))
return instances
class ModelE2ETest(unittest.TestCase):
CONFIG_PATH = ""
def setUp(self):
self.model = get_model(self.CONFIG_PATH)
def _test_eval(self, sizes):
inputs = [make_model_inputs(torch.rand(3, size[0], size[1])) for size in sizes]
self.model.eval()
self.model(inputs)
class DensePoseRCNNE2ETest(ModelE2ETest):
CONFIG_PATH = "densepose_rcnn_R_101_FPN_s1x.yaml"
def test_empty_data(self):
self._test_eval([(200, 250), (200, 249)])

View File

@@ -0,0 +1,30 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
import unittest
from .common import (
get_config_files,
get_evolution_config_files,
get_quick_schedules_config_files,
setup,
)
class TestSetup(unittest.TestCase):
def _test_setup(self, config_file):
setup(config_file)
def test_setup_configs(self):
config_files = get_config_files()
for config_file in config_files:
self._test_setup(config_file)
def test_setup_evolution_configs(self):
config_files = get_evolution_config_files()
for config_file in config_files:
self._test_setup(config_file)
def test_setup_quick_schedules_configs(self):
config_files = get_quick_schedules_config_files()
for config_file in config_files:
self._test_setup(config_file)

View File

@@ -0,0 +1,25 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
import unittest
from densepose.data.structures import normalized_coords_transform
class TestStructures(unittest.TestCase):
def test_normalized_coords_transform(self):
bbox = (32, 24, 288, 216)
x0, y0, w, h = bbox
xmin, ymin, xmax, ymax = x0, y0, x0 + w, y0 + h
f = normalized_coords_transform(*bbox)
# Top-left
expected_p, actual_p = (-1, -1), f((xmin, ymin))
self.assertEqual(expected_p, actual_p)
# Top-right
expected_p, actual_p = (1, -1), f((xmax, ymin))
self.assertEqual(expected_p, actual_p)
# Bottom-left
expected_p, actual_p = (-1, 1), f((xmin, ymax))
self.assertEqual(expected_p, actual_p)
# Bottom-right
expected_p, actual_p = (1, 1), f((xmax, ymax))
self.assertEqual(expected_p, actual_p)

View File

@@ -0,0 +1,122 @@
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
"""
DensePose Training Script.
This script is similar to the training script in detectron2/tools.
It is an example of how a user might use detectron2 for a new project.
"""
import logging
import os
from collections import OrderedDict
from fvcore.common.file_io import PathManager
import detectron2.utils.comm as comm
from detectron2.checkpoint import DetectionCheckpointer
from detectron2.config import CfgNode, get_cfg
from detectron2.engine import DefaultTrainer, default_argument_parser, default_setup, hooks, launch
from detectron2.evaluation import COCOEvaluator, DatasetEvaluators, verify_results
from detectron2.modeling import DatasetMapperTTA
from detectron2.utils.logger import setup_logger
from densepose import (
DensePoseCOCOEvaluator,
DensePoseGeneralizedRCNNWithTTA,
add_dataset_category_config,
add_densepose_config,
load_from_cfg,
)
from densepose.data import DatasetMapper, build_detection_test_loader, build_detection_train_loader
class Trainer(DefaultTrainer):
@classmethod
def build_evaluator(cls, cfg: CfgNode, dataset_name, output_folder=None):
if output_folder is None:
output_folder = os.path.join(cfg.OUTPUT_DIR, "inference")
evaluators = [COCOEvaluator(dataset_name, cfg, True, output_folder)]
if cfg.MODEL.DENSEPOSE_ON:
evaluators.append(DensePoseCOCOEvaluator(dataset_name, True, output_folder))
return DatasetEvaluators(evaluators)
@classmethod
def build_test_loader(cls, cfg: CfgNode, dataset_name):
return build_detection_test_loader(cfg, dataset_name, mapper=DatasetMapper(cfg, False))
@classmethod
def build_train_loader(cls, cfg: CfgNode):
return build_detection_train_loader(cfg, mapper=DatasetMapper(cfg, True))
@classmethod
def test_with_TTA(cls, cfg: CfgNode, model):
logger = logging.getLogger("detectron2.trainer")
# In the end of training, run an evaluation with TTA
# Only support some R-CNN models.
logger.info("Running inference with test-time augmentation ...")
transform_data = load_from_cfg(cfg)
model = DensePoseGeneralizedRCNNWithTTA(cfg, model, transform_data, DatasetMapperTTA(cfg))
evaluators = [
cls.build_evaluator(
cfg, name, output_folder=os.path.join(cfg.OUTPUT_DIR, "inference_TTA")
)
for name in cfg.DATASETS.TEST
]
res = cls.test(cfg, model, evaluators)
res = OrderedDict({k + "_TTA": v for k, v in res.items()})
return res
def setup(args):
cfg = get_cfg()
add_dataset_category_config(cfg)
add_densepose_config(cfg)
cfg.merge_from_file(args.config_file)
cfg.merge_from_list(args.opts)
cfg.freeze()
default_setup(cfg, args)
# Setup logger for "densepose" module
setup_logger(output=cfg.OUTPUT_DIR, distributed_rank=comm.get_rank(), name="densepose")
return cfg
def main(args):
cfg = setup(args)
# disable strict kwargs checking: allow one to specify path handle
# hints through kwargs, like timeout in DP evaluation
PathManager.set_strict_kwargs_checking(False)
if args.eval_only:
model = Trainer.build_model(cfg)
DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load(
cfg.MODEL.WEIGHTS, resume=args.resume
)
res = Trainer.test(cfg, model)
if cfg.TEST.AUG.ENABLED:
res.update(Trainer.test_with_TTA(cfg, model))
if comm.is_main_process():
verify_results(cfg, res)
return res
trainer = Trainer(cfg)
trainer.resume_or_load(resume=args.resume)
if cfg.TEST.AUG.ENABLED:
trainer.register_hooks(
[hooks.EvalHook(0, lambda: trainer.test_with_TTA(cfg, trainer.model))]
)
return trainer.train()
if __name__ == "__main__":
args = default_argument_parser().parse_args()
print("Command Line Args:", args)
launch(
main,
args.num_gpus,
num_machines=args.num_machines,
machine_rank=args.machine_rank,
dist_url=args.dist_url,
args=(args,),
)

View File

@@ -0,0 +1,135 @@
# PointRend: Image Segmentation as Rendering
Alexander Kirillov, Yuxin Wu, Kaiming He, Ross Girshick
[[`arXiv`](https://arxiv.org/abs/1912.08193)] [[`BibTeX`](#CitingPointRend)]
<div align="center">
<img src="https://alexander-kirillov.github.io/images/kirillov2019pointrend.jpg"/>
</div><br/>
In this repository, we release code for PointRend in Detectron2. PointRend can be flexibly applied to both instance and semantic segmentation tasks by building on top of existing state-of-the-art models.
## Installation
Install Detectron 2 following [INSTALL.md](https://github.com/facebookresearch/detectron2/blob/master/INSTALL.md). You are ready to go!
## Quick start and visualization
This [Colab Notebook](https://colab.research.google.com/drive/1isGPL5h5_cKoPPhVL9XhMokRtHDvmMVL) tutorial contains examples of PointRend usage and visualizations of its point sampling stages.
## Training
To train a model with 8 GPUs run:
```bash
cd /path/to/detectron2/projects/PointRend
python train_net.py --config-file configs/InstanceSegmentation/pointrend_rcnn_R_50_FPN_1x_coco.yaml --num-gpus 8
```
## Evaluation
Model evaluation can be done similarly:
```bash
cd /path/to/detectron2/projects/PointRend
python train_net.py --config-file configs/InstanceSegmentation/pointrend_rcnn_R_50_FPN_1x_coco.yaml --eval-only MODEL.WEIGHTS /path/to/model_checkpoint
```
## Pretrained Models
## Instance Segmentation
#### COCO
<table><tbody>
<!-- START TABLE -->
<!-- TABLE HEADER -->
<th valign="bottom">Mask<br/>head</th>
<th valign="bottom">Backbone</th>
<th valign="bottom">lr<br/>sched</th>
<th valign="bottom">Output<br/>resolution</th>
<th valign="bottom">mask<br/>AP</th>
<th valign="bottom">mask<br/>AP&ast;</th>
<th valign="bottom">model id</th>
<th valign="bottom">download</th>
<!-- TABLE BODY -->
<tr><td align="left"><a href="configs/InstanceSegmentation/pointrend_rcnn_R_50_FPN_1x_coco.yaml">PointRend</a></td>
<td align="center">R50-FPN</td>
<td align="center">1&times;</td>
<td align="center">224&times;224</td>
<td align="center">36.2</td>
<td align="center">39.7</td>
<td align="center">164254221</td>
<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/PointRend/InstanceSegmentation/pointrend_rcnn_R_50_FPN_1x_coco/164254221/model_final_88c6f8.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/PointRend/InstanceSegmentation/pointrend_rcnn_R_50_FPN_1x_coco/164254221/metrics.json">metrics</a></td>
</tr>
<tr><td align="left"><a href="configs/InstanceSegmentation/pointrend_rcnn_R_50_FPN_3x_coco.yaml">PointRend</a></td>
<td align="center">R50-FPN</td>
<td align="center">3&times;</td>
<td align="center">224&times;224</td>
<td align="center">38.3</td>
<td align="center">41.6</td>
<td align="center">164955410</td>
<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/PointRend/InstanceSegmentation/pointrend_rcnn_R_50_FPN_3x_coco/164955410/model_final_3c3198.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/PointRend/InstanceSegmentation/pointrend_rcnn_R_50_FPN_3x_coco/164955410/metrics.json">metrics</a></td>
</tr>
</tbody></table>
AP&ast; is COCO mask AP evaluated against the higher-quality LVIS annotations; see the paper for details. Run `python detectron2/datasets/prepare_cocofied_lvis.py` to prepare GT files for AP&ast; evaluation. Since LVIS annotations are not exhaustive, `lvis-api` rather than `cocoapi` should be used to evaluate AP&ast;.
#### Cityscapes
The Cityscapes model is trained with ImageNet pretraining.
<table><tbody>
<!-- START TABLE -->
<!-- TABLE HEADER -->
<th valign="bottom">Mask<br/>head</th>
<th valign="bottom">Backbone</th>
<th valign="bottom">lr<br/>sched</th>
<th valign="bottom">Output<br/>resolution</th>
<th valign="bottom">mask<br/>AP</th>
<th valign="bottom">model id</th>
<th valign="bottom">download</th>
<!-- TABLE BODY -->
<tr><td align="left"><a href="configs/InstanceSegmentation/pointrend_rcnn_R_50_FPN_1x_cityscapes.yaml">PointRend</a></td>
<td align="center">R50-FPN</td>
<td align="center">1&times;</td>
<td align="center">224&times;224</td>
<td align="center">35.9</td>
<td align="center">164255101</td>
<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/PointRend/InstanceSegmentation/pointrend_rcnn_R_50_FPN_1x_cityscapes/164255101/model_final_318a02.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/PointRend/InstanceSegmentation/pointrend_rcnn_R_50_FPN_1x_cityscapes/164255101/metrics.json">metrics</a></td>
</tr>
</tbody></table>
## Semantic Segmentation
#### Cityscapes
The Cityscapes model is trained with ImageNet pretraining.
<table><tbody>
<!-- START TABLE -->
<!-- TABLE HEADER -->
<th valign="bottom">Method</th>
<th valign="bottom">Backbone</th>
<th valign="bottom">Output<br/>resolution</th>
<th valign="bottom">mIoU</th>
<th valign="bottom">model id</th>
<th valign="bottom">download</th>
<!-- TABLE BODY -->
<tr><td align="left"><a href="configs/SemanticSegmentation/pointrend_semantic_R_101_FPN_1x_cityscapes.yaml">SemanticFPN + PointRend</a></td>
<td align="center">R101-FPN</td>
<td align="center">1024&times;2048</td>
<td align="center">78.6</td>
<td align="center">186480235</td>
<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/PointRend/SemanticSegmentation/pointrend_semantic_R_101_FPN_1x_cityscapes/186480235/model_final_5f3665.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/PointRend/SemanticSegmentation/pointrend_semantic_R_101_FPN_1x_cityscapes/186480235/metrics.json">metrics</a></td>
</tr>
</tbody></table>
## <a name="CitingPointRend"></a>Citing PointRend
If you use PointRend, please use the following BibTeX entry.
```BibTeX
@InProceedings{kirillov2019pointrend,
title={{PointRend}: Image Segmentation as Rendering},
author={Alexander Kirillov and Yuxin Wu and Kaiming He and Ross Girshick},
journal={ArXiv:1912.08193},
year={2019}
}
```

View File

@@ -0,0 +1,21 @@
_BASE_: "../../../../configs/Base-RCNN-FPN.yaml"
MODEL:
ROI_HEADS:
NAME: "PointRendROIHeads"
IN_FEATURES: ["p2", "p3", "p4", "p5"]
ROI_BOX_HEAD:
TRAIN_ON_PRED_BOXES: True
ROI_MASK_HEAD:
NAME: "CoarseMaskHead"
FC_DIM: 1024
NUM_FC: 2
OUTPUT_SIDE_RESOLUTION: 7
IN_FEATURES: ["p2"]
POINT_HEAD_ON: True
POINT_HEAD:
FC_DIM: 256
NUM_FC: 3
IN_FEATURES: ["p2"]
INPUT:
# PointRend for instance segmentation does not work with "polygon" mask_format.
MASK_FORMAT: "bitmask"

View File

@@ -0,0 +1,23 @@
_BASE_: Base-PointRend-RCNN-FPN.yaml
MODEL:
WEIGHTS: detectron2://ImageNetPretrained/MSRA/R-50.pkl
MASK_ON: true
RESNETS:
DEPTH: 50
ROI_HEADS:
NUM_CLASSES: 8
POINT_HEAD:
NUM_CLASSES: 8
DATASETS:
TEST: ("cityscapes_fine_instance_seg_val",)
TRAIN: ("cityscapes_fine_instance_seg_train",)
SOLVER:
BASE_LR: 0.01
IMS_PER_BATCH: 8
MAX_ITER: 24000
STEPS: (18000,)
INPUT:
MAX_SIZE_TEST: 2048
MAX_SIZE_TRAIN: 2048
MIN_SIZE_TEST: 1024
MIN_SIZE_TRAIN: (800, 832, 864, 896, 928, 960, 992, 1024)

View File

@@ -0,0 +1,9 @@
_BASE_: Base-PointRend-RCNN-FPN.yaml
MODEL:
WEIGHTS: detectron2://ImageNetPretrained/MSRA/R-50.pkl
MASK_ON: true
RESNETS:
DEPTH: 50
# To add COCO AP evaluation against the higher-quality LVIS annotations.
# DATASETS:
# TEST: ("coco_2017_val", "lvis_v0.5_val_cocofied")

View File

@@ -0,0 +1,13 @@
_BASE_: Base-PointRend-RCNN-FPN.yaml
MODEL:
WEIGHTS: detectron2://ImageNetPretrained/MSRA/R-50.pkl
MASK_ON: true
RESNETS:
DEPTH: 50
SOLVER:
STEPS: (210000, 250000)
MAX_ITER: 270000
# To add COCO AP evaluation against the higher-quality LVIS annotations.
# DATASETS:
# TEST: ("coco_2017_val", "lvis_v0.5_val_cocofied")

View File

@@ -0,0 +1,20 @@
_BASE_: Base-PointRend-RCNN-FPN.yaml
MODEL:
WEIGHTS: detectron2://ImageNetPretrained/MSRA/R-50.pkl
MASK_ON: true
RESNETS:
DEPTH: 50
ROI_HEADS:
NUM_CLASSES: 1
POINT_HEAD:
NUM_CLASSES: 1
SOLVER:
STEPS: (210000, 250000)
MAX_ITER: 270000
IMS_PER_BATCH: 1
# To add COCO AP evaluation against the higher-quality LVIS annotations.
# DATASETS:
# TEST: ("coco_2017_val", "lvis_v0.5_val_cocofied")
DATASETS:
TRAIN: ("CIHP_train",)
TEST: ("CIHP_val",)

View File

@@ -0,0 +1,28 @@
_BASE_: Base-PointRend-RCNN-FPN.yaml
MODEL:
WEIGHTS: "./X-101-32x8d.pkl"
PIXEL_STD: [57.375, 57.120, 58.395]
MASK_ON: true
RESNETS:
STRIDE_IN_1X1: False # this is a C2 model
NUM_GROUPS: 32
WIDTH_PER_GROUP: 8
DEPTH: 101
ROI_HEADS:
NUM_CLASSES: 1
POINT_HEAD:
NUM_CLASSES: 1
SOLVER:
STEPS: (210000, 250000)
MAX_ITER: 270000
IMS_PER_BATCH: 1
# To add COCO AP evaluation against the higher-quality LVIS annotations.
# DATASETS:
# TEST: ("coco_2017_val", "lvis_v0.5_val_cocofied")
INPUT:
MIN_SIZE_TRAIN: (640, 864)
MIN_SIZE_TRAIN_SAMPLING: "range"
MAX_SIZE_TRAIN: 1440
DATASETS:
TRAIN: ("CIHP_train",)
TEST: ("CIHP_val",)

View File

@@ -0,0 +1,19 @@
_BASE_: "../../../../configs/Base-RCNN-FPN.yaml"
MODEL:
META_ARCHITECTURE: "SemanticSegmentor"
BACKBONE:
FREEZE_AT: 0
SEM_SEG_HEAD:
NAME: "PointRendSemSegHead"
POINT_HEAD:
NUM_CLASSES: 54
FC_DIM: 256
NUM_FC: 3
IN_FEATURES: ["p2"]
TRAIN_NUM_POINTS: 1024
SUBDIVISION_STEPS: 2
SUBDIVISION_NUM_POINTS: 8192
COARSE_SEM_SEG_HEAD_NAME: "SemSegFPNHead"
DATASETS:
TRAIN: ("coco_2017_train_panoptic_stuffonly",)
TEST: ("coco_2017_val_panoptic_stuffonly",)

View File

@@ -0,0 +1,33 @@
_BASE_: Base-PointRend-Semantic-FPN.yaml
MODEL:
WEIGHTS: detectron2://ImageNetPretrained/MSRA/R-101.pkl
RESNETS:
DEPTH: 101
SEM_SEG_HEAD:
NUM_CLASSES: 19
POINT_HEAD:
NUM_CLASSES: 19
TRAIN_NUM_POINTS: 2048
SUBDIVISION_NUM_POINTS: 8192
DATASETS:
TRAIN: ("cityscapes_fine_sem_seg_train",)
TEST: ("cityscapes_fine_sem_seg_val",)
SOLVER:
BASE_LR: 0.01
STEPS: (40000, 55000)
MAX_ITER: 65000
IMS_PER_BATCH: 32
INPUT:
MIN_SIZE_TRAIN: (512, 768, 1024, 1280, 1536, 1792, 2048)
MIN_SIZE_TRAIN_SAMPLING: "choice"
MIN_SIZE_TEST: 1024
MAX_SIZE_TRAIN: 4096
MAX_SIZE_TEST: 2048
CROP:
ENABLED: True
TYPE: "absolute"
SIZE: (512, 1024)
SINGLE_CATEGORY_MAX_AREA: 0.75
COLOR_AUG_SSD: True
DATALOADER:
NUM_WORKERS: 16

View File

@@ -0,0 +1,5 @@
_BASE_: Base-PointRend-Semantic-FPN.yaml
MODEL:
WEIGHTS: detectron2://ImageNetPretrained/MSRA/R-50.pkl
RESNETS:
DEPTH: 50

View File

@@ -0,0 +1,139 @@
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
"""
PointRend Training Script.
This script is a simplified version of the training script in detectron2/tools.
"""
import os
import torch
import detectron2.utils.comm as comm
from detectron2.checkpoint import DetectionCheckpointer
from detectron2.config import get_cfg
from detectron2.data import MetadataCatalog, build_detection_train_loader
from detectron2.engine import DefaultTrainer, default_argument_parser, default_setup, launch
from detectron2.evaluation import (
CityscapesInstanceEvaluator,
CityscapesSemSegEvaluator,
COCOEvaluator,
DatasetEvaluators,
LVISEvaluator,
SemSegEvaluator,
verify_results,
)
from point_rend import SemSegDatasetMapper, add_pointrend_config
os.environ['CUDA_VISIBLE_DEVICES'] = '4'
# Register Custom Dataset
from detectron2.data.datasets import register_coco_instances
register_coco_instances("CIHP_train", {}, "/data03/v_xuyunqiu/multi_parsing/data/msrcnn_finetune_annotations/CIHP_train.json", "/data03/v_xuyunqiu/data/instance-level_human_parsing/Training/Images")
register_coco_instances("CIHP_val", {}, "/data03/v_xuyunqiu/multi_parsing/data/msrcnn_finetune_annotations/CIHP_val.json", "/data03/v_xuyunqiu/data/instance-level_human_parsing/Validation/Images")
class Trainer(DefaultTrainer):
"""
We use the "DefaultTrainer" which contains a number pre-defined logic for
standard training workflow. They may not work for you, especially if you
are working on a new research project. In that case you can use the cleaner
"SimpleTrainer", or write your own training loop.
"""
@classmethod
def build_evaluator(cls, cfg, dataset_name, output_folder=None):
"""
Create evaluator(s) for a given dataset.
This uses the special metadata "evaluator_type" associated with each builtin dataset.
For your own dataset, you can simply create an evaluator manually in your
script and do not have to worry about the hacky if-else logic here.
"""
if output_folder is None:
output_folder = os.path.join(cfg.OUTPUT_DIR, "inference")
evaluator_list = []
evaluator_type = MetadataCatalog.get(dataset_name).evaluator_type
if evaluator_type == "lvis":
return LVISEvaluator(dataset_name, cfg, True, output_folder)
if evaluator_type == "coco":
return COCOEvaluator(dataset_name, cfg, True, output_folder)
if evaluator_type == "sem_seg":
return SemSegEvaluator(
dataset_name,
distributed=True,
num_classes=cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES,
ignore_label=cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE,
output_dir=output_folder,
)
if evaluator_type == "cityscapes_instance":
assert (
torch.cuda.device_count() >= comm.get_rank()
), "CityscapesEvaluator currently do not work with multiple machines."
return CityscapesInstanceEvaluator(dataset_name)
if evaluator_type == "cityscapes_sem_seg":
assert (
torch.cuda.device_count() >= comm.get_rank()
), "CityscapesEvaluator currently do not work with multiple machines."
return CityscapesSemSegEvaluator(dataset_name)
if len(evaluator_list) == 0:
raise NotImplementedError(
"no Evaluator for the dataset {} with the type {}".format(
dataset_name, evaluator_type
)
)
if len(evaluator_list) == 1:
return evaluator_list[0]
return DatasetEvaluators(evaluator_list)
@classmethod
def build_train_loader(cls, cfg):
if "SemanticSegmentor" in cfg.MODEL.META_ARCHITECTURE:
mapper = SemSegDatasetMapper(cfg, True)
else:
mapper = None
return build_detection_train_loader(cfg, mapper=mapper)
def setup(args):
"""
Create configs and perform basic setups.
"""
cfg = get_cfg()
add_pointrend_config(cfg)
cfg.merge_from_file(args.config_file)
cfg.merge_from_list(args.opts)
cfg.freeze()
default_setup(cfg, args)
return cfg
def main(args):
cfg = setup(args)
if args.eval_only:
model = Trainer.build_model(cfg)
DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load(
cfg.MODEL.WEIGHTS, resume=args.resume
)
res = Trainer.test(cfg, model)
if comm.is_main_process():
verify_results(cfg, res)
return res
trainer = Trainer(cfg)
trainer.resume_or_load(resume=args.resume)
return trainer.train()
if __name__ == "__main__":
args = default_argument_parser().parse_args()
print("Command Line Args:", args)
launch(
main,
args.num_gpus,
num_machines=args.num_machines,
machine_rank=args.machine_rank,
dist_url=args.dist_url,
args=(args,),
)

View File

@@ -0,0 +1,6 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
from .config import add_pointrend_config
from .coarse_mask_head import CoarseMaskHead
from .roi_heads import PointRendROIHeads
from .dataset_mapper import SemSegDatasetMapper
from .semantic_seg import PointRendSemSegHead

View File

@@ -0,0 +1,92 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import fvcore.nn.weight_init as weight_init
import torch
from torch import nn
from torch.nn import functional as F
from detectron2.layers import Conv2d, ShapeSpec
from detectron2.modeling import ROI_MASK_HEAD_REGISTRY
@ROI_MASK_HEAD_REGISTRY.register()
class CoarseMaskHead(nn.Module):
"""
A mask head with fully connected layers. Given pooled features it first reduces channels and
spatial dimensions with conv layers and then uses FC layers to predict coarse masks analogously
to the standard box head.
"""
def __init__(self, cfg, input_shape: ShapeSpec):
"""
The following attributes are parsed from config:
conv_dim: the output dimension of the conv layers
fc_dim: the feature dimension of the FC layers
num_fc: the number of FC layers
output_side_resolution: side resolution of the output square mask prediction
"""
super(CoarseMaskHead, self).__init__()
# fmt: off
self.num_classes = cfg.MODEL.ROI_HEADS.NUM_CLASSES
conv_dim = cfg.MODEL.ROI_MASK_HEAD.CONV_DIM
self.fc_dim = cfg.MODEL.ROI_MASK_HEAD.FC_DIM
num_fc = cfg.MODEL.ROI_MASK_HEAD.NUM_FC
self.output_side_resolution = cfg.MODEL.ROI_MASK_HEAD.OUTPUT_SIDE_RESOLUTION
self.input_channels = input_shape.channels
self.input_h = input_shape.height
self.input_w = input_shape.width
# fmt: on
self.conv_layers = []
if self.input_channels > conv_dim:
self.reduce_channel_dim_conv = Conv2d(
self.input_channels,
conv_dim,
kernel_size=1,
stride=1,
padding=0,
bias=True,
activation=F.relu,
)
self.conv_layers.append(self.reduce_channel_dim_conv)
self.reduce_spatial_dim_conv = Conv2d(
conv_dim, conv_dim, kernel_size=2, stride=2, padding=0, bias=True, activation=F.relu
)
self.conv_layers.append(self.reduce_spatial_dim_conv)
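# The stride-2 conv above halves both spatial dimensions, so the flattened input to the FC
# layers below is conv_dim * (input_h / 2) * (input_w / 2), i.e. divided by 4.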
input_dim = conv_dim * self.input_h * self.input_w
input_dim //= 4
self.fcs = []
for k in range(num_fc):
fc = nn.Linear(input_dim, self.fc_dim)
self.add_module("coarse_mask_fc{}".format(k + 1), fc)
self.fcs.append(fc)
input_dim = self.fc_dim
output_dim = self.num_classes * self.output_side_resolution * self.output_side_resolution
self.prediction = nn.Linear(self.fc_dim, output_dim)
# use normal distribution initialization for mask prediction layer
nn.init.normal_(self.prediction.weight, std=0.001)
nn.init.constant_(self.prediction.bias, 0)
for layer in self.conv_layers:
weight_init.c2_msra_fill(layer)
for layer in self.fcs:
weight_init.c2_xavier_fill(layer)
def forward(self, x):
# unlike BaseMaskRCNNHead, this head only outputs intermediate
# features, because the features will be used later by PointHead.
N = x.shape[0]
x = x.view(N, self.input_channels, self.input_h, self.input_w)
for layer in self.conv_layers:
x = layer(x)
x = torch.flatten(x, start_dim=1)
for layer in self.fcs:
x = F.relu(layer(x))
return self.prediction(x).view(
N, self.num_classes, self.output_side_resolution, self.output_side_resolution
)

View File

@@ -0,0 +1,98 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import numpy as np
import random
import cv2
from fvcore.transforms.transform import Transform
class ColorAugSSDTransform(Transform):
"""
A color related data augmentation used in Single Shot Multibox Detector (SSD).
Wei Liu, Dragomir Anguelov, Dumitru Erhan, Christian Szegedy,
Scott Reed, Cheng-Yang Fu, Alexander C. Berg.
SSD: Single Shot MultiBox Detector. ECCV 2016.
Implementation based on:
https://github.com/weiliu89/caffe/blob
/4817bf8b4200b35ada8ed0dc378dceaf38c539e4
/src/caffe/util/im_transforms.cpp
https://github.com/chainer/chainercv/blob
/7159616642e0be7c5b3ef380b848e16b7e99355b/chainercv
/links/model/ssd/transforms.py
"""
def __init__(
self,
img_format,
brightness_delta=32,
contrast_low=0.5,
contrast_high=1.5,
saturation_low=0.5,
saturation_high=1.5,
hue_delta=18,
):
super().__init__()
assert img_format in ["BGR", "RGB"]
self.is_rgb = img_format == "RGB"
del img_format
self._set_attributes(locals())
def apply_coords(self, coords):
return coords
def apply_segmentation(self, segmentation):
return segmentation
def apply_image(self, img, interp=None):
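# The saturation/hue transforms below use cv2.COLOR_BGR2HSV, i.e. they expect BGR input,
# so RGB images are flipped to BGR here and flipped back before returning.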
if self.is_rgb:
img = img[:, :, [2, 1, 0]]
img = self.brightness(img)
if random.randrange(2):
img = self.contrast(img)
img = self.saturation(img)
img = self.hue(img)
else:
img = self.saturation(img)
img = self.hue(img)
img = self.contrast(img)
if self.is_rgb:
img = img[:, :, [2, 1, 0]]
return img
def convert(self, img, alpha=1, beta=0):
img = img.astype(np.float32) * alpha + beta
img = np.clip(img, 0, 255)
return img.astype(np.uint8)
def brightness(self, img):
if random.randrange(2):
return self.convert(
img, beta=random.uniform(-self.brightness_delta, self.brightness_delta)
)
return img
def contrast(self, img):
if random.randrange(2):
return self.convert(img, alpha=random.uniform(self.contrast_low, self.contrast_high))
return img
def saturation(self, img):
if random.randrange(2):
img = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
img[:, :, 1] = self.convert(
img[:, :, 1], alpha=random.uniform(self.saturation_low, self.saturation_high)
)
return cv2.cvtColor(img, cv2.COLOR_HSV2BGR)
return img
def hue(self, img):
if random.randrange(2):
img = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
img[:, :, 0] = (
img[:, :, 0].astype(int) + random.randint(-self.hue_delta, self.hue_delta)
) % 180
return cv2.cvtColor(img, cv2.COLOR_HSV2BGR)
return img

View File

@@ -0,0 +1,48 @@
# -*- coding: utf-8 -*-
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
from detectron2.config import CfgNode as CN
def add_pointrend_config(cfg):
"""
Add config for PointRend.
"""
# We retry random cropping until no single category in the semantic segmentation GT occupies more
# than a `SINGLE_CATEGORY_MAX_AREA` fraction of the crop.
cfg.INPUT.CROP.SINGLE_CATEGORY_MAX_AREA = 1.0
# Color augmentation from the SSD paper for the semantic segmentation model during training.
cfg.INPUT.COLOR_AUG_SSD = False
# Names of the input feature maps to be used by a coarse mask head.
cfg.MODEL.ROI_MASK_HEAD.IN_FEATURES = ("p2",)
cfg.MODEL.ROI_MASK_HEAD.FC_DIM = 1024
cfg.MODEL.ROI_MASK_HEAD.NUM_FC = 2
# The side size of a coarse mask head prediction.
cfg.MODEL.ROI_MASK_HEAD.OUTPUT_SIDE_RESOLUTION = 7
# True if point head is used.
cfg.MODEL.ROI_MASK_HEAD.POINT_HEAD_ON = False
cfg.MODEL.POINT_HEAD = CN()
cfg.MODEL.POINT_HEAD.NAME = "StandardPointHead"
cfg.MODEL.POINT_HEAD.NUM_CLASSES = 80
# Names of the input feature maps to be used by a mask point head.
cfg.MODEL.POINT_HEAD.IN_FEATURES = ("p2",)
# Number of points sampled during training for a mask point head.
cfg.MODEL.POINT_HEAD.TRAIN_NUM_POINTS = 14 * 14
# Oversampling parameter for PointRend point sampling during training. Parameter `k` in the
# original paper.
cfg.MODEL.POINT_HEAD.OVERSAMPLE_RATIO = 3
# Importance sampling parameter for PointRend point sampling during training. Parameter `beta` in
# the original paper.
cfg.MODEL.POINT_HEAD.IMPORTANCE_SAMPLE_RATIO = 0.75
# Number of subdivision steps during inference.
cfg.MODEL.POINT_HEAD.SUBDIVISION_STEPS = 5
# Maximum number of points selected at each subdivision step (N).
cfg.MODEL.POINT_HEAD.SUBDIVISION_NUM_POINTS = 28 * 28
cfg.MODEL.POINT_HEAD.FC_DIM = 256
cfg.MODEL.POINT_HEAD.NUM_FC = 3
cfg.MODEL.POINT_HEAD.CLS_AGNOSTIC_MASK = False
# If True, then coarse prediction features are used as input for each layer in PointRend's MLP.
cfg.MODEL.POINT_HEAD.COARSE_PRED_EACH_LAYER = True
cfg.MODEL.POINT_HEAD.COARSE_SEM_SEG_HEAD_NAME = "SemSegFPNHead"

View File

@@ -0,0 +1,121 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import copy
import logging
import numpy as np
import torch
from fvcore.common.file_io import PathManager
from fvcore.transforms.transform import CropTransform
from PIL import Image
from detectron2.data import detection_utils as utils
from detectron2.data import transforms as T
from .color_augmentation import ColorAugSSDTransform
"""
This file contains the mapping that's applied to "dataset dicts" for semantic segmentation models.
Unlike the default DatasetMapper this mapper uses cropping as the last transformation.
"""
__all__ = ["SemSegDatasetMapper"]
class SemSegDatasetMapper:
"""
A callable which takes a dataset dict in Detectron2 Dataset format,
and maps it into a format used by semantic segmentation models.
The callable currently does the following:
1. Reads the image from "file_name"
2. Applies geometric transforms to the image and annotation
3. Finds and applies suitable cropping to the image and annotation
4. Prepares the image and annotation as Tensors
"""
def __init__(self, cfg, is_train=True):
if cfg.INPUT.CROP.ENABLED and is_train:
self.crop_gen = T.RandomCrop(cfg.INPUT.CROP.TYPE, cfg.INPUT.CROP.SIZE)
logging.getLogger(__name__).info("CropGen used in training: " + str(self.crop_gen))
else:
self.crop_gen = None
self.tfm_gens = utils.build_transform_gen(cfg, is_train)
if cfg.INPUT.COLOR_AUG_SSD:
self.tfm_gens.append(ColorAugSSDTransform(img_format=cfg.INPUT.FORMAT))
logging.getLogger(__name__).info(
"Color augmnetation used in training: " + str(self.tfm_gens[-1])
)
# fmt: off
self.img_format = cfg.INPUT.FORMAT
self.single_category_max_area = cfg.INPUT.CROP.SINGLE_CATEGORY_MAX_AREA
self.ignore_value = cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE
# fmt: on
self.is_train = is_train
def __call__(self, dataset_dict):
"""
Args:
dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.
Returns:
dict: a format that builtin models in detectron2 accept
"""
dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below
image = utils.read_image(dataset_dict["file_name"], format=self.img_format)
utils.check_image_size(dataset_dict, image)
assert "sem_seg_file_name" in dataset_dict
image, transforms = T.apply_transform_gens(self.tfm_gens, image)
if self.is_train:
with PathManager.open(dataset_dict.pop("sem_seg_file_name"), "rb") as f:
sem_seg_gt = Image.open(f)
sem_seg_gt = np.asarray(sem_seg_gt, dtype="uint8")
sem_seg_gt = transforms.apply_segmentation(sem_seg_gt)
if self.crop_gen:
image, sem_seg_gt = crop_transform(
image,
sem_seg_gt,
self.crop_gen,
self.single_category_max_area,
self.ignore_value,
)
dataset_dict["sem_seg"] = torch.as_tensor(sem_seg_gt.astype("long"))
# Pytorch's dataloader is efficient on torch.Tensor due to shared-memory,
# but not efficient on large generic data structures due to the use of pickle & mp.Queue.
# Therefore it's important to use torch.Tensor.
dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1)))
if not self.is_train:
dataset_dict.pop("sem_seg_file_name", None)
return dataset_dict
return dataset_dict
def crop_transform(image, sem_seg, crop_gen, single_category_max_area, ignore_value):
"""
Find a cropping window such that no single category occupies more than a
`single_category_max_area` fraction of `sem_seg`. The function retries random cropping at most 10 times.
"""
if single_category_max_area >= 1.0:
crop_tfm = crop_gen.get_transform(image)
sem_seg_temp = crop_tfm.apply_segmentation(sem_seg)
else:
h, w = sem_seg.shape
crop_size = crop_gen.get_crop_size((h, w))
for _ in range(10):
y0 = np.random.randint(h - crop_size[0] + 1)
x0 = np.random.randint(w - crop_size[1] + 1)
sem_seg_temp = sem_seg[y0 : y0 + crop_size[0], x0 : x0 + crop_size[1]]
labels, cnt = np.unique(sem_seg_temp, return_counts=True)
cnt = cnt[labels != ignore_value]
if len(cnt) > 1 and np.max(cnt) / np.sum(cnt) < single_category_max_area:
break
crop_tfm = CropTransform(x0, y0, crop_size[1], crop_size[0])
image = crop_tfm.apply_image(image)
return image, sem_seg_temp

View File

@@ -0,0 +1,216 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import torch
from torch.nn import functional as F
from detectron2.layers import cat
from detectron2.structures import Boxes
"""
Shape shorthand in this module:
N: minibatch dimension size, i.e. the number of RoIs for instance segmentation or the
number of images for semantic segmentation.
R: number of ROIs, combined over all images, in the minibatch
P: number of points
"""
def point_sample(input, point_coords, **kwargs):
"""
A wrapper around :function:`torch.nn.functional.grid_sample` to support 3D point_coords tensors.
Unlike :function:`torch.nn.functional.grid_sample` it assumes `point_coords` to lie inside
[0, 1] x [0, 1] square.
Args:
input (Tensor): A tensor of shape (N, C, H, W) that contains features map on a H x W grid.
point_coords (Tensor): A tensor of shape (N, P, 2) or (N, Hgrid, Wgrid, 2) that contains
[0, 1] x [0, 1] normalized point coordinates.
Returns:
output (Tensor): A tensor of shape (N, C, P) or (N, C, Hgrid, Wgrid) that contains
features for points in `point_coords`. The features are obtained via bilinear
interpolation from `input`, in the same way as :function:`torch.nn.functional.grid_sample`.
"""
add_dim = False
if point_coords.dim() == 3:
add_dim = True
point_coords = point_coords.unsqueeze(2)
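# grid_sample expects coordinates in [-1, 1], so map the [0, 1] points via 2 * x - 1.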
output = F.grid_sample(input, 2.0 * point_coords - 1.0, **kwargs)
if add_dim:
output = output.squeeze(3)
return output
def generate_regular_grid_point_coords(R, side_size, device):
"""
Generate regular square grid of points in [0, 1] x [0, 1] coordinate space.
Args:
R (int): The number of grids to sample, one for each region.
side_size (int): The side size of the regular grid.
device (torch.device): Desired device of returned tensor.
Returns:
(Tensor): A tensor of shape (R, side_size^2, 2) that contains coordinates
for the regular grids.
"""
aff = torch.tensor([[[0.5, 0, 0.5], [0, 0.5, 0.5]]], device=device)
r = F.affine_grid(aff, torch.Size((1, 1, side_size, side_size)), align_corners=False)
return r.view(1, -1, 2).expand(R, -1, -1)
def get_uncertain_point_coords_with_randomness(
coarse_logits, uncertainty_func, num_points, oversample_ratio, importance_sample_ratio
):
"""
Sample points in [0, 1] x [0, 1] coordinate space based on their uncertainty. The uncertainties
are calculated for each point using the 'uncertainty_func' function, which takes the point's logit
prediction as input.
See PointRend paper for details.
Args:
coarse_logits (Tensor): A tensor of shape (N, C, Hmask, Wmask) or (N, 1, Hmask, Wmask) for
class-specific or class-agnostic prediction.
uncertainty_func: A function that takes a Tensor of shape (N, C, P) or (N, 1, P) that
contains logit predictions for P points and returns their uncertainties as a Tensor of
shape (N, 1, P).
num_points (int): The number of points P to sample.
oversample_ratio (int): Oversampling parameter.
importance_sample_ratio (float): Ratio of points that are sampled via importance sampling.
Returns:
point_coords (Tensor): A tensor of shape (N, P, 2) that contains the coordinates of P
sampled points.
"""
assert oversample_ratio >= 1
assert importance_sample_ratio <= 1 and importance_sample_ratio >= 0
num_boxes = coarse_logits.shape[0]
num_sampled = int(num_points * oversample_ratio)
point_coords = torch.rand(num_boxes, num_sampled, 2, device=coarse_logits.device)
point_logits = point_sample(coarse_logits, point_coords, align_corners=False)
# It is crucial to calculate uncertainty based on the sampled prediction value for the points.
# Calculating uncertainties of the coarse predictions first and sampling them for points leads
# to incorrect results.
# To illustrate this: assume uncertainty_func(logits)=-abs(logits), a sampled point between
# two coarse predictions with -1 and 1 logits has 0 logits, and therefore 0 uncertainty value.
# However, if we calculate uncertainties for the coarse predictions first,
# both will have -1 uncertainty, and the sampled point will get -1 uncertainty.
point_uncertainties = uncertainty_func(point_logits)
num_uncertain_points = int(importance_sample_ratio * num_points)
num_random_points = num_points - num_uncertain_points
idx = torch.topk(point_uncertainties[:, 0, :], k=num_uncertain_points, dim=1)[1]
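# topk indices are relative to each box's own candidate list; add a per-box offset so they
# index into the flattened (num_boxes * num_sampled) set of sampled points.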
shift = num_sampled * torch.arange(num_boxes, dtype=torch.long, device=coarse_logits.device)
idx += shift[:, None]
point_coords = point_coords.view(-1, 2)[idx.view(-1), :].view(
num_boxes, num_uncertain_points, 2
)
if num_random_points > 0:
point_coords = cat(
[
point_coords,
torch.rand(num_boxes, num_random_points, 2, device=coarse_logits.device),
],
dim=1,
)
return point_coords
def get_uncertain_point_coords_on_grid(uncertainty_map, num_points):
"""
Find the `num_points` most uncertain points from the `uncertainty_map` grid.
Args:
uncertainty_map (Tensor): A tensor of shape (N, 1, H, W) that contains uncertainty
values for a set of points on a regular H x W grid.
num_points (int): The number of points P to select.
Returns:
point_indices (Tensor): A tensor of shape (N, P) that contains indices from
[0, H x W) of the most uncertain points.
point_coords (Tensor): A tensor of shape (N, P, 2) that contains [0, 1] x [0, 1] normalized
coordinates of the most uncertain points from the H x W grid.
"""
R, _, H, W = uncertainty_map.shape
h_step = 1.0 / float(H)
w_step = 1.0 / float(W)
num_points = min(H * W, num_points)
point_indices = torch.topk(uncertainty_map.view(R, H * W), k=num_points, dim=1)[1]
point_coords = torch.zeros(R, num_points, 2, dtype=torch.float, device=uncertainty_map.device)
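# Convert flat grid indices into normalized (x, y) coordinates of the corresponding cell centers.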
point_coords[:, :, 0] = w_step / 2.0 + (point_indices % W).to(torch.float) * w_step
point_coords[:, :, 1] = h_step / 2.0 + (point_indices // W).to(torch.float) * h_step
return point_indices, point_coords
def point_sample_fine_grained_features(features_list, feature_scales, boxes, point_coords):
"""
Get features from feature maps in `features_list` that correspond to specific point coordinates
inside each bounding box from `boxes`.
Args:
features_list (list[Tensor]): A list of feature map tensors to get features from.
feature_scales (list[float]): A list of scales for tensors in `features_list`.
boxes (list[Boxes]): A list of I Boxes objects that contain R_1 + ... + R_I = R boxes all
together.
point_coords (Tensor): A tensor of shape (R, P, 2) that contains
[0, 1] x [0, 1] box-normalized coordinates of the P sampled points.
Returns:
point_features (Tensor): A tensor of shape (R, C, P) that contains features sampled
from all feature maps in `features_list` for P sampled points for all R boxes in `boxes`.
point_coords_wrt_image (Tensor): A tensor of shape (R, P, 2) that contains image-level
coordinates of P points.
"""
cat_boxes = Boxes.cat(boxes)
num_boxes = [len(b) for b in boxes]
point_coords_wrt_image = get_point_coords_wrt_image(cat_boxes.tensor, point_coords)
split_point_coords_wrt_image = torch.split(point_coords_wrt_image, num_boxes)
point_features = []
for idx_img, point_coords_wrt_image_per_image in enumerate(split_point_coords_wrt_image):
point_features_per_image = []
for idx_feature, feature_map in enumerate(features_list):
h, w = feature_map.shape[-2:]
scale = torch.tensor([w, h], device=feature_map.device) / feature_scales[idx_feature]
point_coords_scaled = point_coords_wrt_image_per_image / scale
point_features_per_image.append(
point_sample(
feature_map[idx_img].unsqueeze(0),
point_coords_scaled.unsqueeze(0),
align_corners=False,
)
.squeeze(0)
.transpose(1, 0)
)
point_features.append(cat(point_features_per_image, dim=1))
return cat(point_features, dim=0), point_coords_wrt_image
def get_point_coords_wrt_image(boxes_coords, point_coords):
"""
Convert box-normalized [0, 1] x [0, 1] point coordinates to image-level coordinates.
Args:
boxes_coords (Tensor): A tensor of shape (R, 4) that contains bounding box
coordinates.
point_coords (Tensor): A tensor of shape (R, P, 2) that contains
[0, 1] x [0, 1] box-normalized coordinates of the P sampled points.
Returns:
point_coords_wrt_image (Tensor): A tensor of shape (R, P, 2) that contains
image-level coordinates of the P sampled points.
"""
with torch.no_grad():
point_coords_wrt_image = point_coords.clone()
point_coords_wrt_image[:, :, 0] = point_coords_wrt_image[:, :, 0] * (
boxes_coords[:, None, 2] - boxes_coords[:, None, 0]
)
point_coords_wrt_image[:, :, 1] = point_coords_wrt_image[:, :, 1] * (
boxes_coords[:, None, 3] - boxes_coords[:, None, 1]
)
point_coords_wrt_image[:, :, 0] += boxes_coords[:, None, 0]
point_coords_wrt_image[:, :, 1] += boxes_coords[:, None, 1]
return point_coords_wrt_image

View File

@@ -0,0 +1,154 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import fvcore.nn.weight_init as weight_init
import torch
from torch import nn
from torch.nn import functional as F
from detectron2.layers import ShapeSpec, cat
from detectron2.structures import BitMasks
from detectron2.utils.events import get_event_storage
from detectron2.utils.registry import Registry
from .point_features import point_sample
POINT_HEAD_REGISTRY = Registry("POINT_HEAD")
POINT_HEAD_REGISTRY.__doc__ = """
Registry for point heads, which make predictions for a given set of per-point features.
The registered object will be called with `obj(cfg, input_shape)`.
"""
def roi_mask_point_loss(mask_logits, instances, points_coord):
"""
Compute the point-based loss for instance segmentation mask predictions.
Args:
mask_logits (Tensor): A tensor of shape (R, C, P) or (R, 1, P) for class-specific or
class-agnostic, where R is the total number of predicted masks in all images, C is the
number of foreground classes, and P is the number of points sampled for each mask.
The values are logits.
instances (list[Instances]): A list of N Instances, where N is the number of images
in the batch. These instances are in 1:1 correspondence with the `mask_logits`. So, the i-th
element of the list contains R_i objects and R_1 + ... + R_N is equal to R.
The ground-truth labels (class, box, mask, ...) associated with each instance are stored
in fields.
points_coord (Tensor): A tensor of shape (R, P, 2), where R is the total number of
predicted masks and P is the number of points for each mask. The coordinates are in
the image pixel coordinate space, i.e. [0, H] x [0, W].
Returns:
point_loss (Tensor): A scalar tensor containing the loss.
"""
assert len(instances) == 0 or isinstance(
instances[0].gt_masks, BitMasks
), "Point head works with GT in 'bitmask' format only. Set INPUT.MASK_FORMAT to 'bitmask'."
with torch.no_grad():
cls_agnostic_mask = mask_logits.size(1) == 1
total_num_masks = mask_logits.size(0)
gt_classes = []
gt_mask_logits = []
idx = 0
for instances_per_image in instances:
if not cls_agnostic_mask:
gt_classes_per_image = instances_per_image.gt_classes.to(dtype=torch.int64)
gt_classes.append(gt_classes_per_image)
gt_bit_masks = instances_per_image.gt_masks.tensor
h, w = instances_per_image.gt_masks.image_size
scale = torch.tensor([w, h], dtype=torch.float, device=gt_bit_masks.device)
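# points_coord is given in image pixel coordinates; normalize by (w, h) to get the
# [0, 1] x [0, 1] coordinates expected by point_sample.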
points_coord_grid_sample_format = (
points_coord[idx : idx + len(instances_per_image)] / scale
)
idx += len(instances_per_image)
gt_mask_logits.append(
point_sample(
gt_bit_masks.to(torch.float32).unsqueeze(1),
points_coord_grid_sample_format,
align_corners=False,
).squeeze(1)
)
gt_mask_logits = cat(gt_mask_logits)
# torch.mean (in binary_cross_entropy_with_logits) doesn't
# accept empty tensors, so handle it separately
if gt_mask_logits.numel() == 0:
return mask_logits.sum() * 0
if cls_agnostic_mask:
mask_logits = mask_logits[:, 0]
else:
indices = torch.arange(total_num_masks)
gt_classes = cat(gt_classes, dim=0)
mask_logits = mask_logits[indices, gt_classes]
# Log the training accuracy (using gt classes and 0.0 threshold for the logits)
mask_accurate = (mask_logits > 0.0) == gt_mask_logits.to(dtype=torch.uint8)
mask_accuracy = mask_accurate.nonzero().size(0) / mask_accurate.numel()
get_event_storage().put_scalar("point_rend/accuracy", mask_accuracy)
point_loss = F.binary_cross_entropy_with_logits(
mask_logits, gt_mask_logits.to(dtype=torch.float32), reduction="mean"
)
return point_loss
@POINT_HEAD_REGISTRY.register()
class StandardPointHead(nn.Module):
"""
A point head multi-layer perceptron which we model with conv1d layers with kernel 1. The head
takes both fine-grained and coarse prediction features as its input.
"""
def __init__(self, cfg, input_shape: ShapeSpec):
"""
The following attributes are parsed from config:
fc_dim: the output dimension of each FC layer
num_fc: the number of FC layers
coarse_pred_each_layer: if True, coarse prediction features are concatenated to each
layer's input
"""
super(StandardPointHead, self).__init__()
# fmt: off
num_classes = cfg.MODEL.POINT_HEAD.NUM_CLASSES
fc_dim = cfg.MODEL.POINT_HEAD.FC_DIM
num_fc = cfg.MODEL.POINT_HEAD.NUM_FC
cls_agnostic_mask = cfg.MODEL.POINT_HEAD.CLS_AGNOSTIC_MASK
self.coarse_pred_each_layer = cfg.MODEL.POINT_HEAD.COARSE_PRED_EACH_LAYER
input_channels = input_shape.channels
# fmt: on
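# Each point's input concatenates the fine-grained features with the num_classes coarse
# logits sampled at that point; with coarse_pred_each_layer the coarse logits are
# re-appended after every FC layer.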
fc_dim_in = input_channels + num_classes
self.fc_layers = []
for k in range(num_fc):
fc = nn.Conv1d(fc_dim_in, fc_dim, kernel_size=1, stride=1, padding=0, bias=True)
self.add_module("fc{}".format(k + 1), fc)
self.fc_layers.append(fc)
fc_dim_in = fc_dim
fc_dim_in += num_classes if self.coarse_pred_each_layer else 0
num_mask_classes = 1 if cls_agnostic_mask else num_classes
self.predictor = nn.Conv1d(fc_dim_in, num_mask_classes, kernel_size=1, stride=1, padding=0)
for layer in self.fc_layers:
weight_init.c2_msra_fill(layer)
# use normal distribution initialization for mask prediction layer
nn.init.normal_(self.predictor.weight, std=0.001)
if self.predictor.bias is not None:
nn.init.constant_(self.predictor.bias, 0)
def forward(self, fine_grained_features, coarse_features):
x = torch.cat((fine_grained_features, coarse_features), dim=1)
for layer in self.fc_layers:
x = F.relu(layer(x))
if self.coarse_pred_each_layer:
x = cat((x, coarse_features), dim=1)
return self.predictor(x)
def build_point_head(cfg, input_channels):
"""
Build a point head defined by `cfg.MODEL.POINT_HEAD.NAME`.
"""
head_name = cfg.MODEL.POINT_HEAD.NAME
return POINT_HEAD_REGISTRY.get(head_name)(cfg, input_channels)

View File

@@ -0,0 +1,227 @@
# -*- coding: utf-8 -*-
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import numpy as np
import torch
from detectron2.layers import ShapeSpec, cat, interpolate
from detectron2.modeling import ROI_HEADS_REGISTRY, StandardROIHeads
from detectron2.modeling.roi_heads.mask_head import (
build_mask_head,
mask_rcnn_inference,
mask_rcnn_loss,
)
from detectron2.modeling.roi_heads.roi_heads import select_foreground_proposals
from .point_features import (
generate_regular_grid_point_coords,
get_uncertain_point_coords_on_grid,
get_uncertain_point_coords_with_randomness,
point_sample,
point_sample_fine_grained_features,
)
from .point_head import build_point_head, roi_mask_point_loss
def calculate_uncertainty(logits, classes):
"""
We estimate uncertainty as the L1 distance between 0.0 and the logit prediction in 'logits' for the
foreground class in `classes`.
Args:
logits (Tensor): A tensor of shape (R, C, ...) or (R, 1, ...) for class-specific or
class-agnostic, where R is the total number of predicted masks in all images and C is
the number of foreground classes. The values are logits.
classes (list): A list of length R that contains either the predicted or ground-truth class
for each predicted mask.
Returns:
scores (Tensor): A tensor of shape (R, 1, ...) that contains uncertainty scores with
the most uncertain locations having the highest uncertainty score.
"""
if logits.shape[1] == 1:
gt_class_logits = logits.clone()
else:
gt_class_logits = logits[
torch.arange(logits.shape[0], device=logits.device), classes
].unsqueeze(1)
return -(torch.abs(gt_class_logits))
@ROI_HEADS_REGISTRY.register()
class PointRendROIHeads(StandardROIHeads):
"""
The RoI heads class for PointRend instance segmentation models.
In this class we redefine the mask head of `StandardROIHeads`, leaving all other heads intact.
To avoid namespace conflicts with other heads, we use names starting with `mask_` for all
variables that correspond to the mask head in the class's namespace.
"""
def __init__(self, cfg, input_shape):
# TODO use explicit args style
super().__init__(cfg, input_shape)
self._init_mask_head(cfg, input_shape)
def _init_mask_head(self, cfg, input_shape):
# fmt: off
self.mask_on = cfg.MODEL.MASK_ON
if not self.mask_on:
return
self.mask_coarse_in_features = cfg.MODEL.ROI_MASK_HEAD.IN_FEATURES
self.mask_coarse_side_size = cfg.MODEL.ROI_MASK_HEAD.POOLER_RESOLUTION
self._feature_scales = {k: 1.0 / v.stride for k, v in input_shape.items()}
# fmt: on
in_channels = np.sum([input_shape[f].channels for f in self.mask_coarse_in_features])
self.mask_coarse_head = build_mask_head(
cfg,
ShapeSpec(
channels=in_channels,
width=self.mask_coarse_side_size,
height=self.mask_coarse_side_size,
),
)
self._init_point_head(cfg, input_shape)
def _init_point_head(self, cfg, input_shape):
# fmt: off
self.mask_point_on = cfg.MODEL.ROI_MASK_HEAD.POINT_HEAD_ON
if not self.mask_point_on:
return
assert cfg.MODEL.ROI_HEADS.NUM_CLASSES == cfg.MODEL.POINT_HEAD.NUM_CLASSES
self.mask_point_in_features = cfg.MODEL.POINT_HEAD.IN_FEATURES
self.mask_point_train_num_points = cfg.MODEL.POINT_HEAD.TRAIN_NUM_POINTS
self.mask_point_oversample_ratio = cfg.MODEL.POINT_HEAD.OVERSAMPLE_RATIO
self.mask_point_importance_sample_ratio = cfg.MODEL.POINT_HEAD.IMPORTANCE_SAMPLE_RATIO
# The next two parameters are used in the adaptive subdivision inference procedure
self.mask_point_subdivision_steps = cfg.MODEL.POINT_HEAD.SUBDIVISION_STEPS
self.mask_point_subdivision_num_points = cfg.MODEL.POINT_HEAD.SUBDIVISION_NUM_POINTS
# fmt: on
in_channels = np.sum([input_shape[f].channels for f in self.mask_point_in_features])
self.mask_point_head = build_point_head(
cfg, ShapeSpec(channels=in_channels, width=1, height=1)
)
def _forward_mask(self, features, instances):
"""
Forward logic of the mask prediction branch.
Args:
features (dict[str, Tensor]): #level input features for mask prediction
instances (list[Instances]): the per-image instances to train/predict masks.
In training, they can be the proposals.
In inference, they can be the predicted boxes.
Returns:
In training, a dict of losses.
In inference, update `instances` with new fields "pred_masks" and return it.
"""
if not self.mask_on:
return {} if self.training else instances
if self.training:
proposals, _ = select_foreground_proposals(instances, self.num_classes)
proposal_boxes = [x.proposal_boxes for x in proposals]
mask_coarse_logits = self._forward_mask_coarse(features, proposal_boxes)
losses = {"loss_mask": mask_rcnn_loss(mask_coarse_logits, proposals)}
losses.update(self._forward_mask_point(features, mask_coarse_logits, proposals))
return losses
else:
pred_boxes = [x.pred_boxes for x in instances]
mask_coarse_logits = self._forward_mask_coarse(features, pred_boxes)
mask_logits = self._forward_mask_point(features, mask_coarse_logits, instances)
mask_rcnn_inference(mask_logits, instances)
return instances
def _forward_mask_coarse(self, features, boxes):
"""
Forward logic of the coarse mask head.
"""
point_coords = generate_regular_grid_point_coords(
sum(len(x) for x in boxes), self.mask_coarse_side_size, boxes[0].device
)
mask_coarse_features_list = [features[k] for k in self.mask_coarse_in_features]
features_scales = [self._feature_scales[k] for k in self.mask_coarse_in_features]
# For regular grids of points, this function is equivalent to `len(features_list)` calls
# of `ROIAlign` (with `SAMPLING_RATIO=2`), concatenating the results.
mask_features, _ = point_sample_fine_grained_features(
mask_coarse_features_list, features_scales, boxes, point_coords
)
return self.mask_coarse_head(mask_features)
def _forward_mask_point(self, features, mask_coarse_logits, instances):
"""
Forward logic of the mask point head.
"""
if not self.mask_point_on:
return {} if self.training else mask_coarse_logits
mask_features_list = [features[k] for k in self.mask_point_in_features]
features_scales = [self._feature_scales[k] for k in self.mask_point_in_features]
if self.training:
proposal_boxes = [x.proposal_boxes for x in instances]
gt_classes = cat([x.gt_classes for x in instances])
with torch.no_grad():
point_coords = get_uncertain_point_coords_with_randomness(
mask_coarse_logits,
lambda logits: calculate_uncertainty(logits, gt_classes),
self.mask_point_train_num_points,
self.mask_point_oversample_ratio,
self.mask_point_importance_sample_ratio,
)
fine_grained_features, point_coords_wrt_image = point_sample_fine_grained_features(
mask_features_list, features_scales, proposal_boxes, point_coords
)
coarse_features = point_sample(mask_coarse_logits, point_coords, align_corners=False)
point_logits = self.mask_point_head(fine_grained_features, coarse_features)
return {
"loss_mask_point": roi_mask_point_loss(
point_logits, instances, point_coords_wrt_image
)
}
else:
pred_boxes = [x.pred_boxes for x in instances]
pred_classes = cat([x.pred_classes for x in instances])
# The subdivision code will fail with the empty list of boxes
if len(pred_classes) == 0:
return mask_coarse_logits
mask_logits = mask_coarse_logits.clone()
for subdivision_step in range(self.mask_point_subdivision_steps):
mask_logits = interpolate(
mask_logits, scale_factor=2, mode="bilinear", align_corners=False
)
# If `mask_point_subdivision_num_points` is greater than or equal to the
# resolution of the next step, then we can skip this step
H, W = mask_logits.shape[-2:]
if (
self.mask_point_subdivision_num_points >= 4 * H * W
and subdivision_step < self.mask_point_subdivision_steps - 1
):
continue
uncertainty_map = calculate_uncertainty(mask_logits, pred_classes)
point_indices, point_coords = get_uncertain_point_coords_on_grid(
uncertainty_map, self.mask_point_subdivision_num_points
)
fine_grained_features, _ = point_sample_fine_grained_features(
mask_features_list, features_scales, pred_boxes, point_coords
)
coarse_features = point_sample(
mask_coarse_logits, point_coords, align_corners=False
)
point_logits = self.mask_point_head(fine_grained_features, coarse_features)
# put mask point predictions to the right places on the upsampled grid.
R, C, H, W = mask_logits.shape
point_indices = point_indices.unsqueeze(1).expand(-1, C, -1)
mask_logits = (
mask_logits.reshape(R, C, H * W)
.scatter_(2, point_indices, point_logits)
.view(R, C, H, W)
)
return mask_logits

View File

@@ -0,0 +1,134 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import numpy as np
from typing import Dict
import torch
from torch import nn
from torch.nn import functional as F
from detectron2.layers import ShapeSpec, cat
from detectron2.modeling import SEM_SEG_HEADS_REGISTRY
from .point_features import (
get_uncertain_point_coords_on_grid,
get_uncertain_point_coords_with_randomness,
point_sample,
)
from .point_head import build_point_head
def calculate_uncertainty(sem_seg_logits):
"""
For each location of the prediction `sem_seg_logits` we estimate the uncertainty as the
difference between the top first and top second predicted logits.
Args:
sem_seg_logits (Tensor): A tensor of shape (N, C, ...), where N is the minibatch size and
C is the number of foreground classes. The values are logits.
Returns:
scores (Tensor): A tensor of shape (N, 1, ...) that contains uncertainty scores with
the most uncertain locations having the highest uncertainty score.
"""
top2_scores = torch.topk(sem_seg_logits, k=2, dim=1)[0]
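# The score is (second-highest - highest) logit, which is <= 0; a small margin yields a value
# close to 0, so the most ambiguous locations receive the highest uncertainty scores.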
return (top2_scores[:, 1] - top2_scores[:, 0]).unsqueeze(1)
@SEM_SEG_HEADS_REGISTRY.register()
class PointRendSemSegHead(nn.Module):
"""
A semantic segmentation head that combines a coarse head set in `MODEL.POINT_HEAD.COARSE_SEM_SEG_HEAD_NAME`
and a point head set in `MODEL.POINT_HEAD.NAME`.
"""
def __init__(self, cfg, input_shape: Dict[str, ShapeSpec]):
super().__init__()
self.ignore_value = cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE
self.coarse_sem_seg_head = SEM_SEG_HEADS_REGISTRY.get(
cfg.MODEL.POINT_HEAD.COARSE_SEM_SEG_HEAD_NAME
)(cfg, input_shape)
self._init_point_head(cfg, input_shape)
def _init_point_head(self, cfg, input_shape: Dict[str, ShapeSpec]):
# fmt: off
assert cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES == cfg.MODEL.POINT_HEAD.NUM_CLASSES
feature_channels = {k: v.channels for k, v in input_shape.items()}
self.in_features = cfg.MODEL.POINT_HEAD.IN_FEATURES
self.train_num_points = cfg.MODEL.POINT_HEAD.TRAIN_NUM_POINTS
self.oversample_ratio = cfg.MODEL.POINT_HEAD.OVERSAMPLE_RATIO
self.importance_sample_ratio = cfg.MODEL.POINT_HEAD.IMPORTANCE_SAMPLE_RATIO
self.subdivision_steps = cfg.MODEL.POINT_HEAD.SUBDIVISION_STEPS
self.subdivision_num_points = cfg.MODEL.POINT_HEAD.SUBDIVISION_NUM_POINTS
# fmt: on
in_channels = np.sum([feature_channels[f] for f in self.in_features])
self.point_head = build_point_head(cfg, ShapeSpec(channels=in_channels, width=1, height=1))
def forward(self, features, targets=None):
coarse_sem_seg_logits = self.coarse_sem_seg_head.layers(features)
if self.training:
losses = self.coarse_sem_seg_head.losses(coarse_sem_seg_logits, targets)
with torch.no_grad():
point_coords = get_uncertain_point_coords_with_randomness(
coarse_sem_seg_logits,
calculate_uncertainty,
self.train_num_points,
self.oversample_ratio,
self.importance_sample_ratio,
)
coarse_features = point_sample(coarse_sem_seg_logits, point_coords, align_corners=False)
fine_grained_features = cat(
[
point_sample(features[in_feature], point_coords, align_corners=False)
for in_feature in self.in_features
]
)
point_logits = self.point_head(fine_grained_features, coarse_features)
point_targets = (
point_sample(
targets.unsqueeze(1).to(torch.float),
point_coords,
mode="nearest",
align_corners=False,
)
.squeeze(1)
.to(torch.long)
)
losses["loss_sem_seg_point"] = F.cross_entropy(
point_logits, point_targets, reduction="mean", ignore_index=self.ignore_value
)
return None, losses
else:
sem_seg_logits = coarse_sem_seg_logits.clone()
for _ in range(self.subdivision_steps):
sem_seg_logits = F.interpolate(
sem_seg_logits, scale_factor=2, mode="bilinear", align_corners=False
)
uncertainty_map = calculate_uncertainty(sem_seg_logits)
point_indices, point_coords = get_uncertain_point_coords_on_grid(
uncertainty_map, self.subdivision_num_points
)
fine_grained_features = cat(
[
point_sample(features[in_feature], point_coords, align_corners=False)
for in_feature in self.in_features
]
)
coarse_features = point_sample(
coarse_sem_seg_logits, point_coords, align_corners=False
)
point_logits = self.point_head(fine_grained_features, coarse_features)
# put sem seg point predictions to the right places on the upsampled grid.
N, C, H, W = sem_seg_logits.shape
point_indices = point_indices.unsqueeze(1).expand(-1, C, -1)
sem_seg_logits = (
sem_seg_logits.reshape(N, C, H * W)
.scatter_(2, point_indices, point_logits)
.view(N, C, H, W)
)
return sem_seg_logits, {}

View File

@@ -0,0 +1,2 @@
python finetune_net.py --config-file configs/InstanceSegmentation/pointrend_rcnn_X_101_32x8d_FPN_3x_parsing.yaml --num-gpus 1
#python finetune_net.py --config-file configs/InstanceSegmentation/pointrend_rcnn_R_50_FPN_3x_parsing.yaml --num-gpus 1

View File

@@ -0,0 +1,133 @@
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
"""
PointRend Training Script.
This script is a simplified version of the training script in detectron2/tools.
"""
import os
import torch
import detectron2.utils.comm as comm
from detectron2.checkpoint import DetectionCheckpointer
from detectron2.config import get_cfg
from detectron2.data import MetadataCatalog, build_detection_train_loader
from detectron2.engine import DefaultTrainer, default_argument_parser, default_setup, launch
from detectron2.evaluation import (
CityscapesInstanceEvaluator,
CityscapesSemSegEvaluator,
COCOEvaluator,
DatasetEvaluators,
LVISEvaluator,
SemSegEvaluator,
verify_results,
)
from point_rend import SemSegDatasetMapper, add_pointrend_config
class Trainer(DefaultTrainer):
"""
We use the "DefaultTrainer" which contains a number pre-defined logic for
standard training workflow. They may not work for you, especially if you
are working on a new research project. In that case you can use the cleaner
"SimpleTrainer", or write your own training loop.
"""
@classmethod
def build_evaluator(cls, cfg, dataset_name, output_folder=None):
"""
Create evaluator(s) for a given dataset.
This uses the special metadata "evaluator_type" associated with each builtin dataset.
For your own dataset, you can simply create an evaluator manually in your
script and do not have to worry about the hacky if-else logic here.
"""
if output_folder is None:
output_folder = os.path.join(cfg.OUTPUT_DIR, "inference")
evaluator_list = []
evaluator_type = MetadataCatalog.get(dataset_name).evaluator_type
if evaluator_type == "lvis":
return LVISEvaluator(dataset_name, cfg, True, output_folder)
if evaluator_type == "coco":
return COCOEvaluator(dataset_name, cfg, True, output_folder)
if evaluator_type == "sem_seg":
return SemSegEvaluator(
dataset_name,
distributed=True,
num_classes=cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES,
ignore_label=cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE,
output_dir=output_folder,
)
if evaluator_type == "cityscapes_instance":
assert (
torch.cuda.device_count() >= comm.get_rank()
), "CityscapesEvaluator currently do not work with multiple machines."
return CityscapesInstanceEvaluator(dataset_name)
if evaluator_type == "cityscapes_sem_seg":
assert (
torch.cuda.device_count() >= comm.get_rank()
), "CityscapesEvaluator currently do not work with multiple machines."
return CityscapesSemSegEvaluator(dataset_name)
if len(evaluator_list) == 0:
raise NotImplementedError(
"no Evaluator for the dataset {} with the type {}".format(
dataset_name, evaluator_type
)
)
if len(evaluator_list) == 1:
return evaluator_list[0]
return DatasetEvaluators(evaluator_list)
@classmethod
def build_train_loader(cls, cfg):
if "SemanticSegmentor" in cfg.MODEL.META_ARCHITECTURE:
mapper = SemSegDatasetMapper(cfg, True)
else:
mapper = None
return build_detection_train_loader(cfg, mapper=mapper)
def setup(args):
"""
Create configs and perform basic setups.
"""
cfg = get_cfg()
add_pointrend_config(cfg)
cfg.merge_from_file(args.config_file)
cfg.merge_from_list(args.opts)
cfg.freeze()
default_setup(cfg, args)
return cfg
def main(args):
cfg = setup(args)
if args.eval_only:
model = Trainer.build_model(cfg)
DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load(
cfg.MODEL.WEIGHTS, resume=args.resume
)
res = Trainer.test(cfg, model)
if comm.is_main_process():
verify_results(cfg, res)
return res
trainer = Trainer(cfg)
trainer.resume_or_load(resume=args.resume)
return trainer.train()
if __name__ == "__main__":
args = default_argument_parser().parse_args()
print("Command Line Args:", args)
launch(
main,
args.num_gpus,
num_machines=args.num_machines,
machine_rank=args.machine_rank,
dist_url=args.dist_url,
args=(args,),
)

View File

@@ -0,0 +1,31 @@
Here are a few projects that are built on detectron2.
They are examples of how to use detectron2 as a library, to make your projects more
maintainable.
## Projects by Facebook
Note that these are research projects, and therefore may not have the same level
of support or stability as detectron2.
+ [DensePose: Dense Human Pose Estimation In The Wild](DensePose)
+ [Scale-Aware Trident Networks for Object Detection](TridentNet)
+ [TensorMask: A Foundation for Dense Object Segmentation](TensorMask)
+ [Mesh R-CNN](https://github.com/facebookresearch/meshrcnn)
+ [PointRend: Image Segmentation as Rendering](PointRend)
+ [Momentum Contrast for Unsupervised Visual Representation Learning](https://github.com/facebookresearch/moco/tree/master/detection)
## External Projects
External projects in the community that use detectron2:
<!--
- If you want to contribute, note that:
- 1. please add your project to the end of the list and try to use only one line
- 2. the project must provide models trained on standard data
-->
+ [VoVNet backbones](https://github.com/youngwanLEE/vovnet-detectron2).
+ [AdelaiDet](https://github.com/aim-uofa/adet), a detection toolbox from the University of Adelaide.
+ [CenterMask : Real-Time Anchor-Free Instance Segmentation](https://github.com/youngwanLEE/centermask2)

View File

@@ -0,0 +1,64 @@
# TensorMask in Detectron2
**A Foundation for Dense Object Segmentation**
Xinlei Chen, Ross Girshick, Kaiming He, Piotr Dollár
[[`arXiv`](https://arxiv.org/abs/1903.12174)] [[`BibTeX`](#CitingTensorMask)]
<div align="center">
<img src="http://xinleic.xyz/images/tmask.png" width="700px" />
</div>
In this repository, we release code for TensorMask in Detectron2.
TensorMask is a dense sliding-window instance segmentation framework that, for the first time, achieves results close to the well-developed Mask R-CNN framework -- both qualitatively and quantitatively. It establishes a conceptually complementary direction for object instance segmentation research.
## Installation
First install Detectron2 following the [documentation](https://detectron2.readthedocs.io/tutorials/install.html) and
[set up the dataset](../../datasets). Then compile the TensorMask-specific op (`swap_align2nat`):
```bash
cd /path/to/detectron2/projects/TensorMask
python setup.py build develop
```
## Training
To train a model, run:
```bash
python /path/to/detectron2/projects/TensorMask/train_net.py --config-file <config.yaml>
```
For example, to launch TensorMask BiPyramid training (1x schedule) with ResNet-50 backbone on 8 GPUs,
one should execute:
```bash
python /path/to/detectron2/projects/TensorMask/train_net.py --config-file configs/tensormask_R_50_FPN_1x.yaml --num-gpus 8
```
## Evaluation
Model evaluation can be done similarly (6x schedule with scale augmentation):
```bash
python /path/to/detectron2/projects/TensorMask/train_net.py --config-file configs/tensormask_R_50_FPN_6x.yaml --eval-only MODEL.WEIGHTS /path/to/model_checkpoint
```
# Pretrained Models
| Backbone | lr sched | AP box | AP mask | download |
| -------- | -------- | -- | --- | -------- |
| R50 | 1x | 37.6 | 32.4 | <a href="https://dl.fbaipublicfiles.com/detectron2/TensorMask/tensormask_R_50_FPN_1x/152549419/model_final_8f325c.pkl">model</a>&nbsp;\| &nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/TensorMask/tensormask_R_50_FPN_1x/152549419/metrics.json">metrics</a> |
| R50 | 6x | 41.4 | 35.8 | <a href="https://dl.fbaipublicfiles.com/detectron2/TensorMask/tensormask_R_50_FPN_6x/153538791/model_final_e8df31.pkl">model</a>&nbsp;\| &nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/TensorMask/tensormask_R_50_FPN_6x/153538791/metrics.json">metrics</a> |
## <a name="CitingTensorMask"></a>Citing TensorMask
If you use TensorMask, please use the following BibTeX entry.
```
@InProceedings{chen2019tensormask,
title={Tensormask: A Foundation for Dense Object Segmentation},
author={Chen, Xinlei and Girshick, Ross and He, Kaiming and Doll{\'a}r, Piotr},
journal={The International Conference on Computer Vision (ICCV)},
year={2019}
}
```

View File

@@ -0,0 +1,25 @@
MODEL:
META_ARCHITECTURE: "TensorMask"
MASK_ON: True
BACKBONE:
NAME: "build_retinanet_resnet_fpn_backbone"
RESNETS:
OUT_FEATURES: ["res2", "res3", "res4", "res5"]
ANCHOR_GENERATOR:
SIZES: [[44, 60], [88, 120], [176, 240], [352, 480], [704, 960], [1408, 1920]]
ASPECT_RATIOS: [[1.0]]
FPN:
IN_FEATURES: ["res2", "res3", "res4", "res5"]
FUSE_TYPE: "avg"
TENSOR_MASK:
ALIGNED_ON: True
BIPYRAMID_ON: True
DATASETS:
TRAIN: ("coco_2017_train",)
TEST: ("coco_2017_val",)
SOLVER:
IMS_PER_BATCH: 16
BASE_LR: 0.02
STEPS: (60000, 80000)
MAX_ITER: 90000
VERSION: 2

View File

@@ -0,0 +1,5 @@
_BASE_: "Base-TensorMask.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
RESNETS:
DEPTH: 50

View File

@@ -0,0 +1,11 @@
_BASE_: "Base-TensorMask.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
RESNETS:
DEPTH: 50
SOLVER:
STEPS: (480000, 520000)
MAX_ITER: 540000
INPUT:
MIN_SIZE_TRAIN_SAMPLING: "range"
MIN_SIZE_TRAIN: (640, 800)

View File

@@ -0,0 +1,69 @@
#!/usr/bin/env python
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import glob
import os
from setuptools import find_packages, setup
import torch
from torch.utils.cpp_extension import CUDA_HOME, CppExtension, CUDAExtension
def get_extensions():
this_dir = os.path.dirname(os.path.abspath(__file__))
extensions_dir = os.path.join(this_dir, "tensormask", "layers", "csrc")
main_source = os.path.join(extensions_dir, "vision.cpp")
sources = glob.glob(os.path.join(extensions_dir, "**", "*.cpp"))
source_cuda = glob.glob(os.path.join(extensions_dir, "**", "*.cu")) + glob.glob(
os.path.join(extensions_dir, "*.cu")
)
sources = [main_source] + sources
extension = CppExtension
extra_compile_args = {"cxx": []}
define_macros = []
if (torch.cuda.is_available() and CUDA_HOME is not None) or os.getenv("FORCE_CUDA", "0") == "1":
extension = CUDAExtension
sources += source_cuda
define_macros += [("WITH_CUDA", None)]
extra_compile_args["nvcc"] = [
"-DCUDA_HAS_FP16=1",
"-D__CUDA_NO_HALF_OPERATORS__",
"-D__CUDA_NO_HALF_CONVERSIONS__",
"-D__CUDA_NO_HALF2_OPERATORS__",
]
# It's better if pytorch can do this by default ..
CC = os.environ.get("CC", None)
if CC is not None:
extra_compile_args["nvcc"].append("-ccbin={}".format(CC))
sources = [os.path.join(extensions_dir, s) for s in sources]
include_dirs = [extensions_dir]
ext_modules = [
extension(
"tensormask._C",
sources,
include_dirs=include_dirs,
define_macros=define_macros,
extra_compile_args=extra_compile_args,
)
]
return ext_modules
setup(
name="tensormask",
version="0.1",
author="FAIR",
packages=find_packages(exclude=("configs", "tests")),
python_requires=">=3.6",
ext_modules=get_extensions(),
cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension},
)

View File

@@ -0,0 +1,3 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
from .config import add_tensormask_config
from .arch import TensorMask

View File

@@ -0,0 +1,904 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import copy
import logging
import math
from typing import List
import torch
import torch.nn.functional as F
from fvcore.nn import sigmoid_focal_loss_star_jit, smooth_l1_loss
from torch import nn
from detectron2.layers import ShapeSpec, batched_nms, cat, paste_masks_in_image
from detectron2.modeling.anchor_generator import DefaultAnchorGenerator
from detectron2.modeling.backbone import build_backbone
from detectron2.modeling.box_regression import Box2BoxTransform
from detectron2.modeling.meta_arch.build import META_ARCH_REGISTRY
from detectron2.modeling.meta_arch.retinanet import (
permute_all_cls_and_box_to_N_HWA_K_and_concat,
permute_to_N_HWA_K,
)
from detectron2.structures import Boxes, ImageList, Instances
from detectron2.utils.logger import log_first_n
from tensormask.layers import SwapAlign2Nat
__all__ = ["TensorMask"]
def _assignment_rule(
gt_boxes,
anchor_boxes,
unit_lengths,
min_anchor_size,
scale_thresh=2.0,
spatial_thresh=1.0,
uniqueness_on=True,
):
"""
Given two lists of boxes of N ground truth boxes and M anchor boxes,
compute the assignment between the two, following the assignment rules in
https://arxiv.org/abs/1903.12174.
The box order must be (xmin, ymin, xmax, ymax), so please make sure to convert
to BoxMode.XYXY_ABS before calling this function.
Args:
gt_boxes, anchor_boxes (Boxes): two Boxes. Contains N & M boxes/anchors, respectively.
unit_lengths (Tensor): Contains the unit lengths of M anchor boxes.
min_anchor_size (float): Minimum size of the anchor, in pixels
scale_thresh (float): The `scale` threshold: the maximum size of the anchor
should not be greater than scale_thresh x max(h, w) of
the ground truth box.
spatial_thresh (float): The `spatial` threshold: the l2 distance between the
center of the anchor and the ground truth box should not
be greater than spatial_thresh x u where u is the unit length.
Returns:
matches (Tensor[int64]): a vector of length M, where matches[i] is a matched
ground-truth index in [0, N)
match_labels (Tensor[int8]): a vector of length M, where match_labels[i] indicates
whether a prediction is a true or false positive or ignored
"""
gt_boxes, anchor_boxes = gt_boxes.tensor, anchor_boxes.tensor
N = gt_boxes.shape[0]
M = anchor_boxes.shape[0]
if N == 0 or M == 0:
return (
gt_boxes.new_full((M,), 0, dtype=torch.int64),
gt_boxes.new_full((M,), -1, dtype=torch.int8),
)
# Containment rule
lt = torch.min(gt_boxes[:, None, :2], anchor_boxes[:, :2]) # [N,M,2]
rb = torch.max(gt_boxes[:, None, 2:], anchor_boxes[:, 2:]) # [N,M,2]
union = cat([lt, rb], dim=2) # [N,M,4]
dummy_gt_boxes = torch.zeros_like(gt_boxes)
anchor = dummy_gt_boxes[:, None, :] + anchor_boxes[:, :] # [N,M,4]
contain_matrix = torch.all(union == anchor, dim=2) # [N,M]
# Centrality rule, scale
gt_size_lower = torch.max(gt_boxes[:, 2:] - gt_boxes[:, :2], dim=1)[0] # [N]
gt_size_upper = gt_size_lower * scale_thresh # [N]
# Fall back for small objects
gt_size_upper[gt_size_upper < min_anchor_size] = min_anchor_size
# Due to sampling of locations, the anchor sizes are deducted with sampling strides
anchor_size = (
torch.max(anchor_boxes[:, 2:] - anchor_boxes[:, :2], dim=1)[0] - unit_lengths
) # [M]
size_diff_upper = gt_size_upper[:, None] - anchor_size # [N,M]
scale_matrix = size_diff_upper >= 0 # [N,M]
# Centrality rule, spatial
gt_center = (gt_boxes[:, 2:] + gt_boxes[:, :2]) / 2 # [N,2]
anchor_center = (anchor_boxes[:, 2:] + anchor_boxes[:, :2]) / 2 # [M,2]
offset_center = gt_center[:, None, :] - anchor_center[:, :] # [N,M,2]
offset_center /= unit_lengths[:, None] # [N,M,2]
spatial_square = spatial_thresh * spatial_thresh
spatial_matrix = torch.sum(offset_center * offset_center, dim=2) <= spatial_square
assign_matrix = (contain_matrix & scale_matrix & spatial_matrix).int()
# assign_matrix is N (gt) x M (predicted)
# Max over gt elements (dim 0) to find best gt candidate for each prediction
matched_vals, matches = assign_matrix.max(dim=0)
match_labels = matches.new_full(matches.size(), 1, dtype=torch.int8)
match_labels[matched_vals == 0] = 0
match_labels[matched_vals == 1] = 1
# find all the elements that match to ground truths multiple times
not_unique_idxs = assign_matrix.sum(dim=0) > 1
if uniqueness_on:
match_labels[not_unique_idxs] = 0
else:
match_labels[not_unique_idxs] = -1
return matches, match_labels
# TODO make the paste_mask function in d2 core support mask list
def _paste_mask_lists_in_image(masks, boxes, image_shape, threshold=0.5):
"""
Paste a list of masks that are of various resolutions (e.g., 28 x 28) into an image.
The location, height, and width for pasting each mask is determined by their
corresponding bounding boxes in boxes.
Args:
masks (list(Tensor)): A list of Tensor of shape (1, Hmask_i, Wmask_i).
Values are in [0, 1]. The list length, Bimg, is the
number of detected object instances in the image.
boxes (Boxes): A Boxes of length Bimg. boxes.tensor[i] and masks[i] correspond
to the same object instance.
image_shape (tuple): height, width
threshold (float): A threshold in [0, 1] for converting the (soft) masks to
binary masks.
Returns:
img_masks (Tensor): A tensor of shape (Bimg, Himage, Wimage), where Bimg is the
number of detected object instances and Himage, Wimage are the image width
and height. img_masks[i] is a binary mask for object instance i.
"""
if len(masks) == 0:
return torch.empty((0, 1) + image_shape, dtype=torch.uint8)
# Loop over masks groups. Each group has the same mask prediction size.
img_masks = []
ind_masks = []
mask_sizes = torch.tensor([m.shape[-1] for m in masks])
unique_sizes = torch.unique(mask_sizes)
for msize in unique_sizes.tolist():
cur_ind = torch.where(mask_sizes == msize)[0]
ind_masks.append(cur_ind)
cur_masks = cat([masks[i] for i in cur_ind])
cur_boxes = boxes[cur_ind]
img_masks.append(paste_masks_in_image(cur_masks, cur_boxes, image_shape, threshold))
img_masks = cat(img_masks)
ind_masks = cat(ind_masks)
img_masks_out = torch.empty_like(img_masks)
img_masks_out[ind_masks, :, :] = img_masks
return img_masks_out
def _postprocess(results, result_mask_info, output_height, output_width, mask_threshold=0.5):
"""
Post-process the output boxes for TensorMask.
The input images are often resized when entering an object detector.
As a result, we often need the outputs of the detector in a different
resolution from its inputs.
This function will postprocess the raw outputs of TensorMask
to produce outputs according to the desired output resolution.
Args:
results (Instances): the raw outputs from the detector.
`results.image_size` contains the input image resolution the detector sees.
This object might be modified in-place. Note that it does not contain the field
`pred_masks`, which is provided by another input `result_masks`.
result_mask_info (list[Tensor], Boxes): a pair of two items for mask related results.
The first item is a list of #detection tensors, each is the predicted masks.
The second item is the anchors corresponding to the predicted masks.
output_height, output_width: the desired output resolution.
Returns:
Instances: the postprocessed output from the model, based on the output resolution
"""
scale_x, scale_y = (output_width / results.image_size[1], output_height / results.image_size[0])
results = Instances((output_height, output_width), **results.get_fields())
output_boxes = results.pred_boxes
output_boxes.tensor[:, 0::2] *= scale_x
output_boxes.tensor[:, 1::2] *= scale_y
output_boxes.clip(results.image_size)
inds_nonempty = output_boxes.nonempty()
results = results[inds_nonempty]
result_masks, result_anchors = result_mask_info
if result_masks:
result_anchors.tensor[:, 0::2] *= scale_x
result_anchors.tensor[:, 1::2] *= scale_y
result_masks = [x for (i, x) in zip(inds_nonempty.tolist(), result_masks) if i]
results.pred_masks = _paste_mask_lists_in_image(
result_masks,
result_anchors[inds_nonempty],
results.image_size,
threshold=mask_threshold,
)
return results
class TensorMaskAnchorGenerator(DefaultAnchorGenerator):
"""
For a set of image sizes and feature maps, computes a set of anchors for TensorMask.
It also computes the unit lengths and indexes for each anchor box.
"""
def grid_anchors_with_unit_lengths_and_indexes(self, grid_sizes):
anchors = []
unit_lengths = []
indexes = []
for lvl, (size, stride, base_anchors) in enumerate(
zip(grid_sizes, self.strides, self.cell_anchors)
):
grid_height, grid_width = size
device = base_anchors.device
shifts_x = torch.arange(
0, grid_width * stride, step=stride, dtype=torch.float32, device=device
)
shifts_y = torch.arange(
0, grid_height * stride, step=stride, dtype=torch.float32, device=device
)
shift_y, shift_x = torch.meshgrid(shifts_y, shifts_x)
shifts = torch.stack((shift_x, shift_y, shift_x, shift_y), dim=2)
# Stack anchors in shapes of (HWA, 4)
cur_anchor = (shifts[:, :, None, :] + base_anchors.view(1, 1, -1, 4)).view(-1, 4)
anchors.append(cur_anchor)
unit_lengths.append(
torch.full((cur_anchor.shape[0],), stride, dtype=torch.float32, device=device)
)
# create mask indexes using mesh grid
shifts_l = torch.full((1,), lvl, dtype=torch.int64, device=device)
shifts_i = torch.zeros((1,), dtype=torch.int64, device=device)
shifts_h = torch.arange(0, grid_height, dtype=torch.int64, device=device)
shifts_w = torch.arange(0, grid_width, dtype=torch.int64, device=device)
shifts_a = torch.arange(0, base_anchors.shape[0], dtype=torch.int64, device=device)
grids = torch.meshgrid(shifts_l, shifts_i, shifts_h, shifts_w, shifts_a)
indexes.append(torch.stack(grids, dim=5).view(-1, 5))
return anchors, unit_lengths, indexes
def forward(self, features):
"""
Returns:
list[list[Boxes]]: a list of #image elements. Each is a list of #feature level Boxes.
The Boxes contains anchors of this image on the specific feature level.
list[list[Tensor]]: a list of #image elements. Each is a list of #feature level tensors.
The tensor contains strides, or unit lengths for the anchors.
list[list[Tensor]]: a list of #image elements. Each is a list of #feature level tensors.
The Tensor contains indexes for the anchors, with the last dimension meaning
(L, N, H, W, A), where L is level, N is image (not set yet), H is height,
W is width, and A is anchor.
"""
num_images = len(features[0])
grid_sizes = [feature_map.shape[-2:] for feature_map in features]
anchors_list, lengths_list, indexes_list = self.grid_anchors_with_unit_lengths_and_indexes(
grid_sizes
)
# Convert anchors from Tensor to Boxes
anchors_per_im = [Boxes(x) for x in anchors_list]
# TODO it can be simplified to not return duplicated information for
# each image, just like detectron2's own AnchorGenerator
anchors = [copy.deepcopy(anchors_per_im) for _ in range(num_images)]
unit_lengths = [copy.deepcopy(lengths_list) for _ in range(num_images)]
indexes = [copy.deepcopy(indexes_list) for _ in range(num_images)]
return anchors, unit_lengths, indexes
@META_ARCH_REGISTRY.register()
class TensorMask(nn.Module):
"""
TensorMask model. Creates FPN backbone, anchors and a head for classification
and box regression. Calculates and applies proper losses to class, box, and
masks.
"""
def __init__(self, cfg):
super().__init__()
# fmt: off
self.num_classes = cfg.MODEL.TENSOR_MASK.NUM_CLASSES
self.in_features = cfg.MODEL.TENSOR_MASK.IN_FEATURES
self.anchor_sizes = cfg.MODEL.ANCHOR_GENERATOR.SIZES
self.num_levels = len(cfg.MODEL.ANCHOR_GENERATOR.SIZES)
# Loss parameters:
self.focal_loss_alpha = cfg.MODEL.TENSOR_MASK.FOCAL_LOSS_ALPHA
self.focal_loss_gamma = cfg.MODEL.TENSOR_MASK.FOCAL_LOSS_GAMMA
# Inference parameters:
self.score_threshold = cfg.MODEL.TENSOR_MASK.SCORE_THRESH_TEST
self.topk_candidates = cfg.MODEL.TENSOR_MASK.TOPK_CANDIDATES_TEST
self.nms_threshold = cfg.MODEL.TENSOR_MASK.NMS_THRESH_TEST
self.detections_im = cfg.TEST.DETECTIONS_PER_IMAGE
# Mask parameters:
self.mask_on = cfg.MODEL.MASK_ON
self.mask_loss_weight = cfg.MODEL.TENSOR_MASK.MASK_LOSS_WEIGHT
self.mask_pos_weight = torch.tensor(cfg.MODEL.TENSOR_MASK.POSITIVE_WEIGHT,
dtype=torch.float32)
self.bipyramid_on = cfg.MODEL.TENSOR_MASK.BIPYRAMID_ON
# fmt: on
# build the backbone
self.backbone = build_backbone(cfg)
backbone_shape = self.backbone.output_shape()
feature_shapes = [backbone_shape[f] for f in self.in_features]
feature_strides = [x.stride for x in feature_shapes]
# build anchors
self.anchor_generator = TensorMaskAnchorGenerator(cfg, feature_shapes)
self.num_anchors = self.anchor_generator.num_cell_anchors[0]
anchors_min_level = cfg.MODEL.ANCHOR_GENERATOR.SIZES[0]
self.mask_sizes = [size // feature_strides[0] for size in anchors_min_level]
self.min_anchor_size = min(anchors_min_level) - feature_strides[0]
# head of the TensorMask
self.head = TensorMaskHead(
cfg, self.num_levels, self.num_anchors, self.mask_sizes, feature_shapes
)
# box transform
self.box2box_transform = Box2BoxTransform(weights=cfg.MODEL.TENSOR_MASK.BBOX_REG_WEIGHTS)
self.register_buffer("pixel_mean", torch.Tensor(cfg.MODEL.PIXEL_MEAN).view(-1, 1, 1))
self.register_buffer("pixel_std", torch.Tensor(cfg.MODEL.PIXEL_STD).view(-1, 1, 1))
@property
def device(self):
return self.pixel_mean.device
def forward(self, batched_inputs):
"""
Args:
batched_inputs: a list, batched outputs of :class:`DetectionTransform` .
Each item in the list contains the inputs for one image.
For now, each item in the list is a dict that contains:
image: Tensor, image in (C, H, W) format.
instances: Instances
Other information that's included in the original dicts, such as:
"height", "width" (int): the output resolution of the model, used in inference.
See :meth:`postprocess` for details.
Returns:
losses (dict[str: Tensor]): mapping from a named loss to a tensor
storing the loss. Used during training only.
"""
images = self.preprocess_image(batched_inputs)
if "instances" in batched_inputs[0]:
gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
elif "targets" in batched_inputs[0]:
log_first_n(
logging.WARN, "'targets' in the model inputs is now renamed to 'instances'!", n=10
)
gt_instances = [x["targets"].to(self.device) for x in batched_inputs]
else:
gt_instances = None
features = self.backbone(images.tensor)
features = [features[f] for f in self.in_features]
# apply the TensorMask head
pred_logits, pred_deltas, pred_masks = self.head(features)
# generate anchors based on features, is it image specific?
anchors, unit_lengths, indexes = self.anchor_generator(features)
if self.training:
# get ground truths for class labels and box targets, it will label each anchor
gt_class_info, gt_delta_info, gt_mask_info, num_fg = self.get_ground_truth(
anchors, unit_lengths, indexes, gt_instances
)
# compute the loss
return self.losses(
gt_class_info,
gt_delta_info,
gt_mask_info,
num_fg,
pred_logits,
pred_deltas,
pred_masks,
)
else:
# do inference to get the output
results = self.inference(pred_logits, pred_deltas, pred_masks, anchors, indexes, images)
processed_results = []
for results_im, input_im, image_size in zip(
results, batched_inputs, images.image_sizes
):
height = input_im.get("height", image_size[0])
width = input_im.get("width", image_size[1])
# this is to do post-processing with the image size
result_box, result_mask = results_im
r = _postprocess(result_box, result_mask, height, width)
processed_results.append({"instances": r})
return processed_results
def losses(
self,
gt_class_info,
gt_delta_info,
gt_mask_info,
num_fg,
pred_logits,
pred_deltas,
pred_masks,
):
"""
Args:
For `gt_class_info`, `gt_delta_info`, `gt_mask_info` and `num_fg` parameters, see
:meth:`TensorMask.get_ground_truth`.
For `pred_logits`, `pred_deltas` and `pred_masks`, see
:meth:`TensorMaskHead.forward`.
Returns:
losses (dict[str: Tensor]): mapping from a named loss to a scalar tensor
storing the loss. Used during training only. The potential dict keys are:
"loss_cls", "loss_box_reg" and "loss_mask".
"""
gt_classes_target, gt_valid_inds = gt_class_info
gt_deltas, gt_fg_inds = gt_delta_info
gt_masks, gt_mask_inds = gt_mask_info
loss_normalizer = torch.tensor(max(1, num_fg), dtype=torch.float32, device=self.device)
# classification and regression
pred_logits, pred_deltas = permute_all_cls_and_box_to_N_HWA_K_and_concat(
pred_logits, pred_deltas, self.num_classes
)
loss_cls = (
sigmoid_focal_loss_star_jit(
pred_logits[gt_valid_inds],
gt_classes_target[gt_valid_inds],
alpha=self.focal_loss_alpha,
gamma=self.focal_loss_gamma,
reduction="sum",
)
/ loss_normalizer
)
if num_fg == 0:
loss_box_reg = pred_deltas.sum() * 0
else:
loss_box_reg = (
smooth_l1_loss(pred_deltas[gt_fg_inds], gt_deltas, beta=0.0, reduction="sum")
/ loss_normalizer
)
losses = {"loss_cls": loss_cls, "loss_box_reg": loss_box_reg}
# mask prediction
if self.mask_on:
loss_mask = 0
for lvl in range(self.num_levels):
cur_level_factor = 2 ** lvl if self.bipyramid_on else 1
for anc in range(self.num_anchors):
cur_gt_mask_inds = gt_mask_inds[lvl][anc]
if cur_gt_mask_inds is None:
loss_mask += pred_masks[lvl][anc][0, 0, 0, 0] * 0
else:
cur_mask_size = self.mask_sizes[anc] * cur_level_factor
# TODO maybe there are numerical issues when mask sizes are large
cur_size_divider = torch.tensor(
self.mask_loss_weight / (cur_mask_size ** 2),
dtype=torch.float32,
device=self.device,
)
cur_pred_masks = pred_masks[lvl][anc][
cur_gt_mask_inds[:, 0], # N
:, # V x U
cur_gt_mask_inds[:, 1], # H
cur_gt_mask_inds[:, 2], # W
]
loss_mask += F.binary_cross_entropy_with_logits(
cur_pred_masks.view(-1, cur_mask_size, cur_mask_size), # V, U
gt_masks[lvl][anc].to(dtype=torch.float32),
reduction="sum",
weight=cur_size_divider,
pos_weight=self.mask_pos_weight,
)
losses["loss_mask"] = loss_mask / loss_normalizer
return losses
@torch.no_grad()
def get_ground_truth(self, anchors, unit_lengths, indexes, targets):
"""
Args:
anchors (list[list[Boxes]]): a list of N=#image elements. Each is a
list of #feature level Boxes. The Boxes contains anchors of
this image on the specific feature level.
unit_lengths (list[list[Tensor]]): a list of N=#image elements. Each is a
list of #feature level Tensor. The tensor contains unit lengths for anchors of
this image on the specific feature level.
indexes (list[list[Tensor]]): a list of N=#image elements. Each is a
list of #feature level Tensor. The tensor contains the 5D index of
each anchor, the second dimension means (L, N, H, W, A), where L
is level, N is image, H is height, W is width, and A is anchor.
targets (list[Instances]): a list of N `Instances`s. The i-th
`Instances` contains the ground-truth per-instance annotations
for the i-th input image. Specify `targets` during training only.
Returns:
gt_class_info (Tensor, Tensor): A pair of two tensors for classification.
The first one is an integer tensor of shape (R, #classes) storing ground-truth
labels for each anchor. R is the total number of anchors in the batch.
The second one is an integer tensor of shape (R,), to indicate which
anchors are valid for loss computation and which are not.
gt_delta_info (Tensor, Tensor): A pair of two tensors for boxes.
The first one, of shape (F, 4). F=#foreground anchors.
The last dimension represents ground-truth box2box transform
targets (dx, dy, dw, dh) that map each anchor to its matched ground-truth box.
Only foreground anchors have values in this tensor. Could be `None` if F=0.
The second one, of shape (R,), is an integer tensor indicating which anchors
are foreground ones used for box regression. Could be `None` if F=0.
gt_mask_info (list[list[Tensor]], list[list[Tensor]]): A pair of two lists for masks.
The first one is a list of P=#feature level elements. Each is a
list of A=#anchor tensors. Each tensor contains the ground truth
masks of the same size and for the same feature level. Could be `None`.
The second one is a list of P=#feature level elements. Each is a
list of A=#anchor tensors. Each tensor contains the location of the ground truth
masks of the same size and for the same feature level. The second dimension means
(N, H, W), where N is image, H is height, and W is width. Could be `None`.
num_fg (int): F=#foreground anchors, used later for loss normalization.
"""
gt_classes = []
gt_deltas = []
gt_masks = [[[] for _ in range(self.num_anchors)] for _ in range(self.num_levels)]
gt_mask_inds = [[[] for _ in range(self.num_anchors)] for _ in range(self.num_levels)]
anchors = [Boxes.cat(anchors_i) for anchors_i in anchors]
unit_lengths = [cat(unit_lengths_i) for unit_lengths_i in unit_lengths]
indexes = [cat(indexes_i) for indexes_i in indexes]
num_fg = 0
for i, (anchors_im, unit_lengths_im, indexes_im, targets_im) in enumerate(
zip(anchors, unit_lengths, indexes, targets)
):
# Initialize all
gt_classes_i = torch.full_like(
unit_lengths_im, self.num_classes, dtype=torch.int64, device=self.device
)
# Ground truth classes
has_gt = len(targets_im) > 0
if has_gt:
# Compute the pairwise matrix
gt_matched_inds, anchor_labels = _assignment_rule(
targets_im.gt_boxes, anchors_im, unit_lengths_im, self.min_anchor_size
)
# Find the foreground instances
fg_inds = anchor_labels == 1
fg_anchors = anchors_im[fg_inds]
num_fg += len(fg_anchors)
# Find the ground truths for foreground instances
gt_fg_matched_inds = gt_matched_inds[fg_inds]
# Assign labels for foreground instances
gt_classes_i[fg_inds] = targets_im.gt_classes[gt_fg_matched_inds]
# Anchors with label -1 are ignored, others are left as negative
gt_classes_i[anchor_labels == -1] = -1
# Boxes
# Ground truth box regression, only for foregrounds
matched_gt_boxes = targets_im[gt_fg_matched_inds].gt_boxes
# Compute box regression offsets for foregrounds only
gt_deltas_i = self.box2box_transform.get_deltas(
fg_anchors.tensor, matched_gt_boxes.tensor
)
gt_deltas.append(gt_deltas_i)
# Masks
if self.mask_on:
# Compute masks for each level and each anchor
matched_indexes = indexes_im[fg_inds, :]
for lvl in range(self.num_levels):
ids_lvl = matched_indexes[:, 0] == lvl
if torch.any(ids_lvl):
cur_level_factor = 2 ** lvl if self.bipyramid_on else 1
for anc in range(self.num_anchors):
ids_lvl_anchor = ids_lvl & (matched_indexes[:, 4] == anc)
if torch.any(ids_lvl_anchor):
gt_masks[lvl][anc].append(
targets_im[
gt_fg_matched_inds[ids_lvl_anchor]
].gt_masks.crop_and_resize(
fg_anchors[ids_lvl_anchor].tensor,
self.mask_sizes[anc] * cur_level_factor,
)
)
# Select (N, H, W) dimensions
gt_mask_inds_lvl_anc = matched_indexes[ids_lvl_anchor, 1:4]
# Set the image index to the current image
gt_mask_inds_lvl_anc[:, 0] = i
gt_mask_inds[lvl][anc].append(gt_mask_inds_lvl_anc)
gt_classes.append(gt_classes_i)
# Classes and boxes
gt_classes = cat(gt_classes)
gt_valid_inds = gt_classes >= 0
gt_fg_inds = gt_valid_inds & (gt_classes < self.num_classes)
gt_classes_target = torch.zeros(
(gt_classes.shape[0], self.num_classes), dtype=torch.float32, device=self.device
)
gt_classes_target[gt_fg_inds, gt_classes[gt_fg_inds]] = 1
gt_deltas = cat(gt_deltas) if gt_deltas else None
# Masks
gt_masks = [[cat(mla) if mla else None for mla in ml] for ml in gt_masks]
gt_mask_inds = [[cat(ila) if ila else None for ila in il] for il in gt_mask_inds]
return (
(gt_classes_target, gt_valid_inds),
(gt_deltas, gt_fg_inds),
(gt_masks, gt_mask_inds),
num_fg,
)
def inference(self, pred_logits, pred_deltas, pred_masks, anchors, indexes, images):
"""
Arguments:
pred_logits, pred_deltas, pred_masks: Same as the output of:
meth:`TensorMaskHead.forward`
anchors, indexes: Same as the input of meth:`TensorMask.get_ground_truth`
images (ImageList): the input images
Returns:
results (List[Instances]): a list of #images elements.
"""
assert len(anchors) == len(images)
results = []
pred_logits = [permute_to_N_HWA_K(x, self.num_classes) for x in pred_logits]
pred_deltas = [permute_to_N_HWA_K(x, 4) for x in pred_deltas]
pred_logits = cat(pred_logits, dim=1)
pred_deltas = cat(pred_deltas, dim=1)
for img_idx, (anchors_im, indexes_im) in enumerate(zip(anchors, indexes)):
# Get the size of the current image
image_size = images.image_sizes[img_idx]
logits_im = pred_logits[img_idx]
deltas_im = pred_deltas[img_idx]
if self.mask_on:
masks_im = [[mla[img_idx] for mla in ml] for ml in pred_masks]
else:
masks_im = [None] * self.num_levels
results_im = self.inference_single_image(
logits_im,
deltas_im,
masks_im,
Boxes.cat(anchors_im),
cat(indexes_im),
tuple(image_size),
)
results.append(results_im)
return results
def inference_single_image(
self, pred_logits, pred_deltas, pred_masks, anchors, indexes, image_size
):
"""
Single-image inference. Return bounding-box detection results by thresholding
on scores and applying non-maximum suppression (NMS).
Arguments:
pred_logits (list[Tensor]): list of #feature levels. Each entry contains
tensor of size (AxHxW, K)
pred_deltas (list[Tensor]): Same shape as 'pred_logits' except that K becomes 4.
pred_masks (list[list[Tensor]]): List of #feature levels, each is a list of #anchors.
Each entry contains tensor of size (M_i*M_i, H, W). `None` if mask_on=False.
anchors (list[Boxes]): list of #feature levels. Each entry contains
a Boxes object, which contains all the anchors for that
image in that feature level.
image_size (tuple(H, W)): a tuple of the image height and width.
Returns:
Same as `inference`, but for only one image.
"""
pred_logits = pred_logits.flatten().sigmoid_()
# We get top locations across all levels to accelerate the inference speed,
# which does not seem to affect the accuracy.
# First select values above the threshold
logits_top_idxs = torch.where(pred_logits > self.score_threshold)[0]
# Then get the top values
num_topk = min(self.topk_candidates, logits_top_idxs.shape[0])
pred_prob, topk_idxs = pred_logits[logits_top_idxs].sort(descending=True)
# Keep top k scoring values
pred_prob = pred_prob[:num_topk]
# Keep top k values
top_idxs = logits_top_idxs[topk_idxs[:num_topk]]
# class index
cls_idxs = top_idxs % self.num_classes
# HWA index
top_idxs //= self.num_classes
# predict boxes
pred_boxes = self.box2box_transform.apply_deltas(
pred_deltas[top_idxs], anchors[top_idxs].tensor
)
# apply nms
keep = batched_nms(pred_boxes, pred_prob, cls_idxs, self.nms_threshold)
# pick the top ones
keep = keep[: self.detections_im]
results = Instances(image_size)
results.pred_boxes = Boxes(pred_boxes[keep])
results.scores = pred_prob[keep]
results.pred_classes = cls_idxs[keep]
# deal with masks
result_masks, result_anchors = [], None
if self.mask_on:
# index and anchors, useful for masks
top_indexes = indexes[top_idxs]
top_anchors = anchors[top_idxs]
result_indexes = top_indexes[keep]
result_anchors = top_anchors[keep]
# Get masks and do sigmoid
for lvl, _, h, w, anc in result_indexes.tolist():
cur_size = self.mask_sizes[anc] * (2 ** lvl if self.bipyramid_on else 1)
result_masks.append(
torch.sigmoid(pred_masks[lvl][anc][:, h, w].view(1, cur_size, cur_size))
)
return results, (result_masks, result_anchors)
def preprocess_image(self, batched_inputs):
"""
Normalize, pad and batch the input images.
"""
images = [x["image"].to(self.device) for x in batched_inputs]
images = [(x - self.pixel_mean) / self.pixel_std for x in images]
images = ImageList.from_tensors(images, self.backbone.size_divisibility)
return images
class TensorMaskHead(nn.Module):
def __init__(self, cfg, num_levels, num_anchors, mask_sizes, input_shape: List[ShapeSpec]):
"""
TensorMask head.
"""
super().__init__()
# fmt: off
self.in_features = cfg.MODEL.TENSOR_MASK.IN_FEATURES
in_channels = input_shape[0].channels
num_classes = cfg.MODEL.TENSOR_MASK.NUM_CLASSES
cls_channels = cfg.MODEL.TENSOR_MASK.CLS_CHANNELS
num_convs = cfg.MODEL.TENSOR_MASK.NUM_CONVS
# box parameters
bbox_channels = cfg.MODEL.TENSOR_MASK.BBOX_CHANNELS
# mask parameters
self.mask_on = cfg.MODEL.MASK_ON
self.mask_sizes = mask_sizes
mask_channels = cfg.MODEL.TENSOR_MASK.MASK_CHANNELS
self.align_on = cfg.MODEL.TENSOR_MASK.ALIGNED_ON
self.bipyramid_on = cfg.MODEL.TENSOR_MASK.BIPYRAMID_ON
# fmt: on
# class subnet
cls_subnet = []
cur_channels = in_channels
for _ in range(num_convs):
cls_subnet.append(
nn.Conv2d(cur_channels, cls_channels, kernel_size=3, stride=1, padding=1)
)
cur_channels = cls_channels
cls_subnet.append(nn.ReLU())
self.cls_subnet = nn.Sequential(*cls_subnet)
self.cls_score = nn.Conv2d(
cur_channels, num_anchors * num_classes, kernel_size=3, stride=1, padding=1
)
modules_list = [self.cls_subnet, self.cls_score]
# box subnet
bbox_subnet = []
cur_channels = in_channels
for _ in range(num_convs):
bbox_subnet.append(
nn.Conv2d(cur_channels, bbox_channels, kernel_size=3, stride=1, padding=1)
)
cur_channels = bbox_channels
bbox_subnet.append(nn.ReLU())
self.bbox_subnet = nn.Sequential(*bbox_subnet)
self.bbox_pred = nn.Conv2d(
cur_channels, num_anchors * 4, kernel_size=3, stride=1, padding=1
)
modules_list.extend([self.bbox_subnet, self.bbox_pred])
# mask subnet
if self.mask_on:
mask_subnet = []
cur_channels = in_channels
for _ in range(num_convs):
mask_subnet.append(
nn.Conv2d(cur_channels, mask_channels, kernel_size=3, stride=1, padding=1)
)
cur_channels = mask_channels
mask_subnet.append(nn.ReLU())
self.mask_subnet = nn.Sequential(*mask_subnet)
modules_list.append(self.mask_subnet)
for mask_size in self.mask_sizes:
cur_mask_module = "mask_pred_%02d" % mask_size
self.add_module(
cur_mask_module,
nn.Conv2d(
cur_channels, mask_size * mask_size, kernel_size=1, stride=1, padding=0
),
)
modules_list.append(getattr(self, cur_mask_module))
if self.align_on:
if self.bipyramid_on:
for lvl in range(num_levels):
cur_mask_module = "align2nat_%02d" % lvl
lambda_val = 2 ** lvl
setattr(self, cur_mask_module, SwapAlign2Nat(lambda_val))
# Also the fusing layer, stay at the same channel size
mask_fuse = [
nn.Conv2d(cur_channels, cur_channels, kernel_size=3, stride=1, padding=1),
nn.ReLU(),
]
self.mask_fuse = nn.Sequential(*mask_fuse)
modules_list.append(self.mask_fuse)
else:
self.align2nat = SwapAlign2Nat(1)
# Initialization
for modules in modules_list:
for layer in modules.modules():
if isinstance(layer, nn.Conv2d):
torch.nn.init.normal_(layer.weight, mean=0, std=0.01)
torch.nn.init.constant_(layer.bias, 0)
# Use prior in model initialization to improve stability
bias_value = -(math.log((1 - 0.01) / 0.01))
torch.nn.init.constant_(self.cls_score.bias, bias_value)
def forward(self, features):
"""
Arguments:
features (list[Tensor]): FPN feature map tensors in high to low resolution.
Each tensor in the list corresponds to a different feature level.
Returns:
pred_logits (list[Tensor]): #lvl tensors, each has shape (N, AxK, Hi, Wi).
The tensor predicts the classification probability
at each spatial position for each of the A anchors and K object
classes.
pred_deltas (list[Tensor]): #lvl tensors, each has shape (N, Ax4, Hi, Wi).
The tensor predicts 4-vector (dx,dy,dw,dh) box
regression values for every anchor. These values are the
relative offset between the anchor and the ground truth box.
pred_masks (list(list[Tensor])): #lvl list of tensors, each is a list of
A tensors of shape (N, M_{i,a}, Hi, Wi).
The tensor predicts a dense set of M_ixM_i masks at every location.
"""
pred_logits = [self.cls_score(self.cls_subnet(x)) for x in features]
pred_deltas = [self.bbox_pred(self.bbox_subnet(x)) for x in features]
pred_masks = None
if self.mask_on:
mask_feats = [self.mask_subnet(x) for x in features]
if self.bipyramid_on:
mask_feat_high_res = mask_feats[0]
H, W = mask_feat_high_res.shape[-2:]
mask_feats_up = []
for lvl, mask_feat in enumerate(mask_feats):
lambda_val = 2.0 ** lvl
mask_feat_up = mask_feat
if lvl > 0:
mask_feat_up = F.interpolate(
mask_feat, scale_factor=lambda_val, mode="bilinear", align_corners=False
)
mask_feats_up.append(
self.mask_fuse(mask_feat_up[:, :, :H, :W] + mask_feat_high_res)
)
mask_feats = mask_feats_up
pred_masks = []
for lvl, mask_feat in enumerate(mask_feats):
cur_masks = []
for mask_size in self.mask_sizes:
cur_mask_module = getattr(self, "mask_pred_%02d" % mask_size)
cur_mask = cur_mask_module(mask_feat)
if self.align_on:
if self.bipyramid_on:
cur_mask_module = getattr(self, "align2nat_%02d" % lvl)
cur_mask = cur_mask_module(cur_mask)
else:
cur_mask = self.align2nat(cur_mask)
cur_masks.append(cur_mask)
pred_masks.append(cur_masks)
return pred_logits, pred_deltas, pred_masks

View File

@@ -0,0 +1,50 @@
# -*- coding: utf-8 -*-
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
from detectron2.config import CfgNode as CN
def add_tensormask_config(cfg):
"""
Add config for TensorMask.
"""
cfg.MODEL.TENSOR_MASK = CN()
# Anchor parameters
cfg.MODEL.TENSOR_MASK.IN_FEATURES = ["p2", "p3", "p4", "p5", "p6", "p7"]
# Convolutions to use in the towers
cfg.MODEL.TENSOR_MASK.NUM_CONVS = 4
# Number of foreground classes.
cfg.MODEL.TENSOR_MASK.NUM_CLASSES = 80
# Channel size for the classification tower
cfg.MODEL.TENSOR_MASK.CLS_CHANNELS = 256
cfg.MODEL.TENSOR_MASK.SCORE_THRESH_TEST = 0.05
# Only the top (1000 * #levels) candidate boxes across all levels are
# considered jointly during test (to improve speed)
cfg.MODEL.TENSOR_MASK.TOPK_CANDIDATES_TEST = 6000
cfg.MODEL.TENSOR_MASK.NMS_THRESH_TEST = 0.5
# Box parameters
# Channel size for the box tower
cfg.MODEL.TENSOR_MASK.BBOX_CHANNELS = 128
# Weights on (dx, dy, dw, dh)
cfg.MODEL.TENSOR_MASK.BBOX_REG_WEIGHTS = (1.5, 1.5, 0.75, 0.75)
# Loss parameters
cfg.MODEL.TENSOR_MASK.FOCAL_LOSS_GAMMA = 3.0
cfg.MODEL.TENSOR_MASK.FOCAL_LOSS_ALPHA = 0.3
# Mask parameters
# Channel size for the mask tower
cfg.MODEL.TENSOR_MASK.MASK_CHANNELS = 128
# Mask loss weight
cfg.MODEL.TENSOR_MASK.MASK_LOSS_WEIGHT = 2.0
# weight on positive pixels within the mask
cfg.MODEL.TENSOR_MASK.POSITIVE_WEIGHT = 1.5
# Whether to predict in the aligned representation
cfg.MODEL.TENSOR_MASK.ALIGNED_ON = False
# Whether to use the bipyramid architecture
cfg.MODEL.TENSOR_MASK.BIPYRAMID_ON = False
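if __name__ == "__main__":
    # Usage sketch (illustrative only): attach these TensorMask defaults to a fresh
    # detectron2 config and read back a couple of the values defined above.
    from detectron2.config import get_cfg

    cfg = get_cfg()
    add_tensormask_config(cfg)
    print(cfg.MODEL.TENSOR_MASK.NUM_CLASSES, cfg.MODEL.TENSOR_MASK.TOPK_CANDIDATES_TEST)  # 80 6000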

View File

@@ -0,0 +1,4 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
from .swap_align2nat import SwapAlign2Nat, swap_align2nat
__all__ = [k for k in globals().keys() if not k.startswith("_")]

View File

@@ -0,0 +1,54 @@
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
#pragma once
#include <torch/types.h>
namespace tensormask {
#ifdef WITH_CUDA
at::Tensor SwapAlign2Nat_forward_cuda(
const at::Tensor& X,
const int lambda_val,
const float pad_val);
at::Tensor SwapAlign2Nat_backward_cuda(
const at::Tensor& gY,
const int lambda_val,
const int batch_size,
const int channel,
const int height,
const int width);
#endif
inline at::Tensor SwapAlign2Nat_forward(
const at::Tensor& X,
const int lambda_val,
const float pad_val) {
if (X.type().is_cuda()) {
#ifdef WITH_CUDA
return SwapAlign2Nat_forward_cuda(X, lambda_val, pad_val);
#else
AT_ERROR("Not compiled with GPU support");
#endif
}
AT_ERROR("Not implemented on the CPU");
}
inline at::Tensor SwapAlign2Nat_backward(
const at::Tensor& gY,
const int lambda_val,
const int batch_size,
const int channel,
const int height,
const int width) {
if (gY.type().is_cuda()) {
#ifdef WITH_CUDA
return SwapAlign2Nat_backward_cuda(
gY, lambda_val, batch_size, channel, height, width);
#else
AT_ERROR("Not compiled with GPU support");
#endif
}
AT_ERROR("Not implemented on the CPU");
}
} // namespace tensormask

View File

@@ -0,0 +1,526 @@
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include <ATen/cuda/CUDAApplyUtils.cuh>
// TODO make it in a common file
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \
i += blockDim.x * gridDim.x)
template <typename T>
__device__ inline T get_pixel_val(
const T* tensor,
const int idx,
const int H,
const int W,
const int y,
const int x,
const int V,
const int U,
const int v,
const int u,
const T pad_val) {
if ((y < 0) || (y >= H) || (x < 0) || (x >= W) || (v < 0) || (v >= V) ||
(u < 0) || (u >= U)) {
return pad_val;
} else {
return tensor[(((idx * V + v) * U + u) * H + y) * W + x];
}
}
template <typename T>
__device__ inline void add_pixel_val(
T* tensor,
const T val,
const int idx,
const int H,
const int W,
const int y,
const int x,
const int V,
const int U,
const int v,
const int u) {
if ((val == 0.) || (y < 0) || (y >= H) || (x < 0) || (x >= W) || (v < 0) ||
(v >= V) || (u < 0) || (u >= U)) {
return;
} else {
atomicAdd(tensor + ((((idx * V + v) * U + u) * H + y) * W + x), val);
}
}
template <typename T>
__global__ void SwapAlign2NatForwardFeat(
const int nthreads,
const T* bottom_data,
const int Vout,
const int Uout,
const float hVout,
const float hUout,
const int Vin,
const int Uin,
const float lambda,
const int Hin,
const int Win,
const int Hout,
const int Wout,
const T pad_val,
T* top_data) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
int idx = index;
const int x = idx % Wout;
idx /= Wout;
const int y = idx % Hout;
idx /= Hout;
const int u = idx % Uout;
idx /= Uout;
const int v = idx % Vout;
idx /= Vout;
const float ox = x * lambda + u - hUout + 0.5;
const int xf = static_cast<int>(floor(ox));
const int xc = static_cast<int>(ceil(ox));
const float xwc = ox - xf;
const float xwf = 1. - xwc;
const float oy = y * lambda + v - hVout + 0.5;
const int yf = static_cast<int>(floor(oy));
const int yc = static_cast<int>(ceil(oy));
const float ywc = oy - yf;
const float ywf = 1. - ywc;
const float ou = (u + 0.5) / lambda - 0.5;
const int uf = static_cast<int>(floor(ou));
const int uc = static_cast<int>(ceil(ou));
const float uwc = ou - uf;
const float uwf = 1. - uwc;
const float ov = (v + 0.5) / lambda - 0.5;
const int vf = static_cast<int>(floor(ov));
const int vc = static_cast<int>(ceil(ov));
const float vwc = ov - vf;
const float vwf = 1. - vwc;
T val = ywf * xwf * vwf * uwf *
get_pixel_val(
bottom_data, idx, Hin, Win, yf, xf, Vin, Uin, vf, uf, pad_val) +
ywf * xwf * vwf * uwc *
get_pixel_val(
bottom_data, idx, Hin, Win, yf, xf, Vin, Uin, vf, uc, pad_val) +
ywf * xwf * vwc * uwf *
get_pixel_val(
bottom_data, idx, Hin, Win, yf, xf, Vin, Uin, vc, uf, pad_val) +
ywf * xwf * vwc * uwc *
get_pixel_val(
bottom_data, idx, Hin, Win, yf, xf, Vin, Uin, vc, uc, pad_val) +
ywf * xwc * vwf * uwf *
get_pixel_val(
bottom_data, idx, Hin, Win, yf, xc, Vin, Uin, vf, uf, pad_val) +
ywf * xwc * vwf * uwc *
get_pixel_val(
bottom_data, idx, Hin, Win, yf, xc, Vin, Uin, vf, uc, pad_val) +
ywf * xwc * vwc * uwf *
get_pixel_val(
bottom_data, idx, Hin, Win, yf, xc, Vin, Uin, vc, uf, pad_val) +
ywf * xwc * vwc * uwc *
get_pixel_val(
bottom_data, idx, Hin, Win, yf, xc, Vin, Uin, vc, uc, pad_val) +
ywc * xwf * vwf * uwf *
get_pixel_val(
bottom_data, idx, Hin, Win, yc, xf, Vin, Uin, vf, uf, pad_val) +
ywc * xwf * vwf * uwc *
get_pixel_val(
bottom_data, idx, Hin, Win, yc, xf, Vin, Uin, vf, uc, pad_val) +
ywc * xwf * vwc * uwf *
get_pixel_val(
bottom_data, idx, Hin, Win, yc, xf, Vin, Uin, vc, uf, pad_val) +
ywc * xwf * vwc * uwc *
get_pixel_val(
bottom_data, idx, Hin, Win, yc, xf, Vin, Uin, vc, uc, pad_val) +
ywc * xwc * vwf * uwf *
get_pixel_val(
bottom_data, idx, Hin, Win, yc, xc, Vin, Uin, vf, uf, pad_val) +
ywc * xwc * vwf * uwc *
get_pixel_val(
bottom_data, idx, Hin, Win, yc, xc, Vin, Uin, vf, uc, pad_val) +
ywc * xwc * vwc * uwf *
get_pixel_val(
bottom_data, idx, Hin, Win, yc, xc, Vin, Uin, vc, uf, pad_val) +
ywc * xwc * vwc * uwc *
get_pixel_val(
bottom_data, idx, Hin, Win, yc, xc, Vin, Uin, vc, uc, pad_val);
top_data[index] = val;
}
}
template <typename T>
__global__ void SwapAlign2NatBackwardFeat(
const int nthreads,
const T* top_diff,
const int Vout,
const int Uout,
const float hVout,
const float hUout,
const int Vin,
const int Uin,
const float lambda,
const int Hin,
const int Win,
const int Hout,
const int Wout,
T* bottom_diff) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
int idx = index;
const int x = idx % Wout;
idx /= Wout;
const int y = idx % Hout;
idx /= Hout;
const int u = idx % Uout;
idx /= Uout;
const int v = idx % Vout;
idx /= Vout;
const float ox = x * lambda + u - hUout + 0.5;
const int xf = static_cast<int>(floor(ox));
const int xc = static_cast<int>(ceil(ox));
const float xwc = ox - xf;
const float xwf = 1. - xwc;
const float oy = y * lambda + v - hVout + 0.5;
const int yf = static_cast<int>(floor(oy));
const int yc = static_cast<int>(ceil(oy));
const float ywc = oy - yf;
const float ywf = 1. - ywc;
const float ou = (u + 0.5) / lambda - 0.5;
const int uf = static_cast<int>(floor(ou));
const int uc = static_cast<int>(ceil(ou));
const float uwc = ou - uf;
const float uwf = 1. - uwc;
const float ov = (v + 0.5) / lambda - 0.5;
const int vf = static_cast<int>(floor(ov));
const int vc = static_cast<int>(ceil(ov));
const float vwc = ov - vf;
const float vwf = 1. - vwc;
const T grad = top_diff[index];
add_pixel_val(
bottom_diff,
ywf * xwf * vwf * uwf * grad,
idx,
Hin,
Win,
yf,
xf,
Vin,
Uin,
vf,
uf);
add_pixel_val(
bottom_diff,
ywf * xwf * vwf * uwc * grad,
idx,
Hin,
Win,
yf,
xf,
Vin,
Uin,
vf,
uc);
add_pixel_val(
bottom_diff,
ywf * xwf * vwc * uwf * grad,
idx,
Hin,
Win,
yf,
xf,
Vin,
Uin,
vc,
uf);
add_pixel_val(
bottom_diff,
ywf * xwf * vwc * uwc * grad,
idx,
Hin,
Win,
yf,
xf,
Vin,
Uin,
vc,
uc);
add_pixel_val(
bottom_diff,
ywf * xwc * vwf * uwf * grad,
idx,
Hin,
Win,
yf,
xc,
Vin,
Uin,
vf,
uf);
add_pixel_val(
bottom_diff,
ywf * xwc * vwf * uwc * grad,
idx,
Hin,
Win,
yf,
xc,
Vin,
Uin,
vf,
uc);
add_pixel_val(
bottom_diff,
ywf * xwc * vwc * uwf * grad,
idx,
Hin,
Win,
yf,
xc,
Vin,
Uin,
vc,
uf);
add_pixel_val(
bottom_diff,
ywf * xwc * vwc * uwc * grad,
idx,
Hin,
Win,
yf,
xc,
Vin,
Uin,
vc,
uc);
add_pixel_val(
bottom_diff,
ywc * xwf * vwf * uwf * grad,
idx,
Hin,
Win,
yc,
xf,
Vin,
Uin,
vf,
uf);
add_pixel_val(
bottom_diff,
ywc * xwf * vwf * uwc * grad,
idx,
Hin,
Win,
yc,
xf,
Vin,
Uin,
vf,
uc);
add_pixel_val(
bottom_diff,
ywc * xwf * vwc * uwf * grad,
idx,
Hin,
Win,
yc,
xf,
Vin,
Uin,
vc,
uf);
add_pixel_val(
bottom_diff,
ywc * xwf * vwc * uwc * grad,
idx,
Hin,
Win,
yc,
xf,
Vin,
Uin,
vc,
uc);
add_pixel_val(
bottom_diff,
ywc * xwc * vwf * uwf * grad,
idx,
Hin,
Win,
yc,
xc,
Vin,
Uin,
vf,
uf);
add_pixel_val(
bottom_diff,
ywc * xwc * vwf * uwc * grad,
idx,
Hin,
Win,
yc,
xc,
Vin,
Uin,
vf,
uc);
add_pixel_val(
bottom_diff,
ywc * xwc * vwc * uwf * grad,
idx,
Hin,
Win,
yc,
xc,
Vin,
Uin,
vc,
uf);
add_pixel_val(
bottom_diff,
ywc * xwc * vwc * uwc * grad,
idx,
Hin,
Win,
yc,
xc,
Vin,
Uin,
vc,
uc);
}
}
namespace tensormask {
at::Tensor SwapAlign2Nat_forward_cuda(
const at::Tensor& X,
const int lambda_val,
const float pad_val) {
AT_ASSERTM(X.device().is_cuda(), "input must be a CUDA tensor");
AT_ASSERTM(X.ndimension() == 4, "input must be a 4D tensor");
AT_ASSERTM(lambda_val >= 1, "lambda should be greater or equal to 1");
const int N = X.size(0);
const int C = X.size(1);
const int Vin = static_cast<int>(sqrt(static_cast<float>(C)));
const int Uin = C / Vin;
AT_ASSERTM(
C == Vin * Uin && Vin == Uin, "#channels should be a square number");
const int Vout = lambda_val * Vin;
const int Uout = lambda_val * Uin;
const int Hin = X.size(2);
const int Win = X.size(3);
const float lambda = static_cast<float>(lambda_val);
const int Hout = static_cast<int>(ceil(Hin / lambda));
const int Wout = static_cast<int>(ceil(Win / lambda));
const float hVout = Vout / 2.;
const float hUout = Uout / 2.;
at::cuda::CUDAGuard device_guard(X.device());
at::Tensor Y = at::empty({N, Vout * Uout, Hout, Wout}, X.options());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
dim3 grid(std::min(at::cuda::ATenCeilDiv(Y.numel(), 512L), 4096L));
dim3 block(512);
if (Y.numel() == 0) {
AT_CUDA_CHECK(cudaGetLastError());
return Y;
}
auto X_ = X.contiguous();
AT_DISPATCH_FLOATING_TYPES(X.scalar_type(), "SwapAlign2Nat_forward", [&] {
SwapAlign2NatForwardFeat<scalar_t><<<grid, block, 0, stream>>>(
Y.numel(),
X_.data_ptr<scalar_t>(),
Vout,
Uout,
hVout,
hUout,
Vin,
Uin,
lambda,
Hin,
Win,
Hout,
Wout,
pad_val,
Y.data_ptr<scalar_t>());
});
cudaDeviceSynchronize();
AT_CUDA_CHECK(cudaGetLastError());
return Y;
}
at::Tensor SwapAlign2Nat_backward_cuda(
const at::Tensor& gY,
const int lambda_val,
const int batch_size,
const int channel,
const int height,
const int width) {
AT_ASSERTM(gY.device().is_cuda(), "input gradient must be a CUDA tensor");
AT_ASSERTM(gY.ndimension() == 4, "input gradient must be a 4D tensor");
AT_ASSERTM(lambda_val >= 1, "lambda should be greater or equal to 1");
const int Vin = static_cast<int>(sqrt(static_cast<float>(channel)));
const int Uin = channel / Vin;
const int Vout = lambda_val * Vin;
const int Uout = lambda_val * Uin;
const float hVout = Vout / 2.;
const float hUout = Uout / 2.;
const int Hout = gY.size(2);
const int Wout = gY.size(3);
at::cuda::CUDAGuard device_guard(gY.device());
at::Tensor gX = at::zeros({batch_size, channel, height, width}, gY.options());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
dim3 grid(std::min(at::cuda::ATenCeilDiv(gY.numel(), 512L), 4096L));
dim3 block(512);
// handle possibly empty gradients
if (gY.numel() == 0) {
AT_CUDA_CHECK(cudaGetLastError());
return gX;
}
auto gY_ = gY.contiguous();
AT_DISPATCH_FLOATING_TYPES(gY.scalar_type(), "SwapAlign2Nat_backward", [&] {
SwapAlign2NatBackwardFeat<scalar_t><<<grid, block, 0, stream>>>(
gY.numel(),
gY_.data_ptr<scalar_t>(),
Vout,
Uout,
hVout,
hUout,
Vin,
Uin,
static_cast<float>(lambda_val),
height,
width,
Hout,
Wout,
gX.data_ptr<scalar_t>());
});
AT_CUDA_CHECK(cudaGetLastError());
return gX;
}
} // namespace tensormask

View File

@@ -0,0 +1,19 @@
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
#include <torch/extension.h>
#include "SwapAlign2Nat/SwapAlign2Nat.h"
namespace tensormask {
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def(
"swap_align2nat_forward",
&SwapAlign2Nat_forward,
"SwapAlign2Nat_forward");
m.def(
"swap_align2nat_backward",
&SwapAlign2Nat_backward,
"SwapAlign2Nat_backward");
}
} // namespace tensormask

View File

@@ -0,0 +1,61 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
from torch import nn
from torch.autograd import Function
from torch.autograd.function import once_differentiable
from tensormask import _C
class _SwapAlign2Nat(Function):
@staticmethod
def forward(ctx, X, lambda_val, pad_val):
ctx.lambda_val = lambda_val
ctx.input_shape = X.size()
Y = _C.swap_align2nat_forward(X, lambda_val, pad_val)
return Y
@staticmethod
@once_differentiable
def backward(ctx, gY):
lambda_val = ctx.lambda_val
bs, ch, h, w = ctx.input_shape
gX = _C.swap_align2nat_backward(gY, lambda_val, bs, ch, h, w)
return gX, None, None
swap_align2nat = _SwapAlign2Nat.apply
class SwapAlign2Nat(nn.Module):
"""
The op `SwapAlign2Nat` described in https://arxiv.org/abs/1903.12174.
Given an input tensor that predicts masks of shape (N, C=VxU, H, W),
applying the op returns masks of shape (N, V'xU', H', W'), where
the unit lengths of (V, U) and (H, W) are swapped, and the mask representation
is transformed from aligned to natural.
Args:
lambda_val (int): the relative unit length ratio between (V, U) and (H, W),
as we always have larger unit lengths for (V, U) than (H, W),
lambda_val is always >= 1.
pad_val (float): padding value for locations falling outside of the input
tensor; it defaults to -6 because sigmoid(-6) is ~0, indicating
that there are no masks outside of the tensor.
"""
def __init__(self, lambda_val, pad_val=-6.0):
super(SwapAlign2Nat, self).__init__()
self.lambda_val = lambda_val
self.pad_val = pad_val
def forward(self, X):
return swap_align2nat(X, self.lambda_val, self.pad_val)
def __repr__(self):
tmpstr = self.__class__.__name__ + "("
tmpstr += "lambda_val=" + str(self.lambda_val)
tmpstr += ", pad_val=" + str(self.pad_val)
tmpstr += ")"
return tmpstr
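if __name__ == "__main__":
    # Usage sketch (illustrative only; the op ships a CUDA kernel only, so this is
    # skipped without a GPU). With lambda_val=2, an aligned tensor of shape
    # (N, V*U, H, W) becomes a natural one of shape (N, (2V)*(2U), ceil(H/2), ceil(W/2)).
    import torch

    if torch.cuda.is_available():
        op = SwapAlign2Nat(lambda_val=2).cuda()
        out = op(torch.rand(1, 4, 10, 10, device="cuda"))
        print(out.shape)  # torch.Size([1, 16, 5, 5])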

View File

@@ -0,0 +1 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved

View File

@@ -0,0 +1,32 @@
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import unittest
import torch
from torch.autograd import gradcheck
from tensormask.layers.swap_align2nat import SwapAlign2Nat
class SwapAlign2NatTest(unittest.TestCase):
@unittest.skipIf(not torch.cuda.is_available(), "CUDA not available")
def test_swap_align2nat_gradcheck_cuda(self):
dtype = torch.float64
device = torch.device("cuda")
m = SwapAlign2Nat(2).to(dtype=dtype, device=device)
x = torch.rand(2, 4, 10, 10, dtype=dtype, device=device, requires_grad=True)
self.assertTrue(gradcheck(m, x), "gradcheck failed for SwapAlign2Nat CUDA")
def _swap_align2nat(self, tensor, lambda_val):
"""
The basic setup for testing Swap_Align
"""
op = SwapAlign2Nat(lambda_val, pad_val=0.0)
input = torch.from_numpy(tensor[None, :, :, :].astype("float32"))
output = op.forward(input.cuda()).cpu().numpy()
return output[0]
if __name__ == "__main__":
unittest.main()

View File

@@ -0,0 +1,70 @@
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
"""
TensorMask Training Script.
This script is a simplified version of the training script in detectron2/tools.
"""
import os
import detectron2.utils.comm as comm
from detectron2.checkpoint import DetectionCheckpointer
from detectron2.config import get_cfg
from detectron2.engine import DefaultTrainer, default_argument_parser, default_setup, launch
from detectron2.evaluation import COCOEvaluator, verify_results
from tensormask import add_tensormask_config
class Trainer(DefaultTrainer):
@classmethod
def build_evaluator(cls, cfg, dataset_name, output_folder=None):
if output_folder is None:
output_folder = os.path.join(cfg.OUTPUT_DIR, "inference")
return COCOEvaluator(dataset_name, cfg, True, output_folder)
def setup(args):
"""
Create configs and perform basic setups.
"""
cfg = get_cfg()
add_tensormask_config(cfg)
cfg.merge_from_file(args.config_file)
cfg.merge_from_list(args.opts)
cfg.freeze()
default_setup(cfg, args)
return cfg
def main(args):
cfg = setup(args)
if args.eval_only:
model = Trainer.build_model(cfg)
DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load(
cfg.MODEL.WEIGHTS, resume=args.resume
)
res = Trainer.test(cfg, model)
if comm.is_main_process():
verify_results(cfg, res)
return res
trainer = Trainer(cfg)
trainer.resume_or_load(resume=args.resume)
return trainer.train()
if __name__ == "__main__":
args = default_argument_parser().parse_args()
print("Command Line Args:", args)
launch(
main,
args.num_gpus,
num_machines=args.num_machines,
machine_rank=args.machine_rank,
dist_url=args.dist_url,
args=(args,),
)

View File

@@ -0,0 +1,60 @@
# TridentNet in Detectron2
**Scale-Aware Trident Networks for Object Detection**
Yanghao Li\*, Yuntao Chen\*, Naiyan Wang, Zhaoxiang Zhang
[[`TridentNet`](https://github.com/TuSimple/simpledet/tree/master/models/tridentnet)] [[`arXiv`](https://arxiv.org/abs/1901.01892)] [[`BibTeX`](#CitingTridentNet)]
<div align="center">
<img src="https://drive.google.com/uc?export=view&id=10THEPdIPmf3ooMyNzrfZbpWihEBvixwt" width="700px" />
</div>
In this repository, we implement TridentNet-Fast in Detectron2.
Trident Network (TridentNet) aims to generate scale-specific feature maps with a uniform representational power. We construct a parallel multi-branch architecture in which each branch shares the same transformation parameters but with different receptive fields. TridentNet-Fast is a fast approximation of TridentNet that achieves significant improvements without any additional parameters or computational cost.
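The weight-sharing idea behind the trident branches can be illustrated with a short PyTorch sketch. This is a conceptual toy, not the project's actual `TridentConv` implementation (the class name and sizes below are made up for illustration): a single 3x3 weight is applied with several dilations, so every branch has identical parameters but a different receptive field.
```python
import torch
import torch.nn.functional as F
from torch import nn

class ToyTridentConv(nn.Module):
    """Toy weight-shared trident block: one 3x3 weight applied with different dilations."""

    def __init__(self, in_channels, out_channels, dilations=(1, 2, 3)):
        super().__init__()
        self.dilations = dilations
        self.weight = nn.Parameter(torch.empty(out_channels, in_channels, 3, 3))
        self.bias = nn.Parameter(torch.zeros(out_channels))
        nn.init.kaiming_uniform_(self.weight, a=1)

    def forward(self, x):
        # Every branch reuses the same weight/bias; only the dilation (receptive field)
        # changes, so the extra branches add no parameters.
        return [
            F.conv2d(x, self.weight, self.bias, stride=1, padding=d, dilation=d)
            for d in self.dilations
        ]

branches = ToyTridentConv(16, 16)(torch.randn(1, 16, 32, 32))
print([b.shape for b in branches])  # three tensors, all torch.Size([1, 16, 32, 32])
```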
## Training
To train a model, run:
```bash
python /path/to/detectron2/projects/TridentNet/train_net.py --config-file <config.yaml>
```
For example, to launch end-to-end TridentNet training with a ResNet-50 backbone on 8 GPUs,
one should execute:
```bash
python /path/to/detectron2/projects/TridentNet/train_net.py --config-file configs/tridentnet_fast_R_50_C4_1x.yaml --num-gpus 8
```
## Evaluation
Model evaluation can be done similarly:
```bash
python /path/to/detectron2/projects/TridentNet/train_net.py --config-file configs/tridentnet_fast_R_50_C4_1x.yaml --eval-only MODEL.WEIGHTS model.pth
```
## Results on MS-COCO in Detectron2
|Model|Backbone|Head|lr sched|AP|AP50|AP75|APs|APm|APl|download|
|-----|--------|----|--------|--|----|----|---|---|---|--------|
|Faster|R50-C4|C5-512ROI|1X|35.7|56.1|38.0|19.2|40.9|48.7|<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/faster_rcnn_R_50_C4_1x/137257644/model_final_721ade.pkl">model</a>&nbsp;\|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/faster_rcnn_R_50_C4_1x/137257644/metrics.json">metrics</a>|
|TridentFast|R50-C4|C5-128ROI|1X|38.0|58.1|40.8|19.5|42.2|54.6|<a href="https://dl.fbaipublicfiles.com/detectron2/TridentNet/tridentnet_fast_R_50_C4_1x/148572687/model_final_756cda.pkl">model</a>&nbsp;\|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/TridentNet/tridentnet_fast_R_50_C4_1x/148572687/metrics.json">metrics</a>|
|Faster|R50-C4|C5-512ROI|3X|38.4|58.7|41.3|20.7|42.7|53.1|<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/faster_rcnn_R_50_C4_3x/137849393/model_final_f97cb7.pkl">model</a>&nbsp;\|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/faster_rcnn_R_50_C4_3x/137849393/metrics.json">metrics</a>|
|TridentFast|R50-C4|C5-128ROI|3X|40.6|60.8|43.6|23.4|44.7|57.1|<a href="https://dl.fbaipublicfiles.com/detectron2/TridentNet/tridentnet_fast_R_50_C4_3x/148572287/model_final_e1027c.pkl">model</a>&nbsp;\|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/TridentNet/tridentnet_fast_R_50_C4_3x/148572287/metrics.json">metrics</a>|
|Faster|R101-C4|C5-512ROI|3X|41.1|61.4|44.0|22.2|45.5|55.9|<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/faster_rcnn_R_101_C4_3x/138204752/model_final_298dad.pkl">model</a>&nbsp;\|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/faster_rcnn_R_101_C4_3x/138204752/metrics.json">metrics</a>|
|TridentFast|R101-C4|C5-128ROI|3X|43.6|63.4|47.0|24.3|47.8|60.0|<a href="https://dl.fbaipublicfiles.com/detectron2/TridentNet/tridentnet_fast_R_101_C4_3x/148572198/model_final_164568.pkl">model</a>&nbsp;\|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/TridentNet/tridentnet_fast_R_101_C4_3x/148572198/metrics.json">metrics</a>|
## <a name="CitingTridentNet"></a>Citing TridentNet
If you use TridentNet, please use the following BibTeX entry.
```
@InProceedings{li2019scale,
title={Scale-Aware Trident Networks for Object Detection},
author={Li, Yanghao and Chen, Yuntao and Wang, Naiyan and Zhang, Zhaoxiang},
booktitle={The International Conference on Computer Vision (ICCV)},
year={2019}
}
```

Some files were not shown because too many files have changed in this diff.