Add to new repo again

Руслан Бундюк committed 2025-01-28 21:48:35 +00:00
commit 6e660ddb3c
564 changed files with 75575 additions and 0 deletions

requirements.txt (new file, 15 lines)

@@ -0,0 +1,15 @@
torch==2.4.0
torchvision==0.19.0
accelerate==0.31.0
diffusers==0.31.0
transformers==4.39.3
gradio==5.8.0
numpy==1.23.0
scikit-image==0.24.0
huggingface_hub==0.26.5
onnxruntime==1.20.1
opencv-python
matplotlib==3.8.3
einops==0.7.0
fastapi[all]

tests/imgs/garment.jpg (new binary file, 732 KiB; content not shown)

tests/imgs/person.jpg (new binary file, 26 KiB; content not shown)

Unnamed new binary file (672 KiB; content not shown)

tests/test.py (new file, 55 lines)

@@ -0,0 +1,55 @@
import requests
import base64
import os
from PIL import Image
import io
from datetime import datetime
def image_to_base64(image_path):
with open(image_path, "rb") as image_file:
return base64.b64encode(image_file.read()).decode()
def base64_to_image(base64_str):
image_data = base64.b64decode(base64_str)
return Image.open(io.BytesIO(image_data))
def save_results(generated_images, output_dir="results"):
os.makedirs(output_dir, exist_ok=True)
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
saved_paths = []
for idx, img_base64 in enumerate(generated_images):
img = base64_to_image(img_base64)
output_path = os.path.join(output_dir, f"result_{timestamp}_{idx}.png")
img.save(output_path)
saved_paths.append(output_path)
print(f"Saved image to {output_path}")
return saved_paths
data = {
"model_image": image_to_base64("imgs/person.jpg"),
"garment_image": image_to_base64("imgs/garment.jpg"),
"category": "Upper-body",
"resolution": "768x1024",
"n_steps": 30,
"image_scale": 2.0,
"num_images": 1
}
try:
response = requests.post("http://localhost:8001/try-on", json=data)
response.raise_for_status()
result = response.json()
if result["status"] == "success":
saved_files = save_results(result["generated_images"])
print(f"Successfully generated {len(saved_files)} images")
print(f"Seed used: {result['seed']}")
else:
print("Generation failed:", result.get("detail", "Unknown error"))
except requests.exceptions.RequestException as e:
print(f"Error making request: {e}")
except Exception as e:
print(f"Error processing results: {e}")

New file (DWpose detector module)

@@ -0,0 +1,68 @@
# Openpose
# Original from CMU https://github.com/CMU-Perceptual-Computing-Lab/openpose
# 2nd Edited by https://github.com/Hzzone/pytorch-openpose
# 3rd Edited by ControlNet
# 4th Edited by ControlNet (added face and correct hands)
import os
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
import torch
import numpy as np
from . import util
from .wholebody import Wholebody
def draw_pose(pose, H, W):
bodies = pose['bodies']
faces = pose['faces']
hands = pose['hands']
candidate = bodies['candidate']
subset = bodies['subset']
canvas = np.zeros(shape=(H, W, 3), dtype=np.uint8)
canvas = util.draw_bodypose(canvas, candidate, subset)
canvas = util.draw_handpose(canvas, hands)
canvas = util.draw_facepose(canvas, faces)
return canvas
class DWposeDetector:
def __init__(self, model_root, device):
self.pose_estimation = Wholebody(model_root, device)
def __call__(self, oriImg):
oriImg = oriImg.copy()
H, W, C = oriImg.shape
with torch.no_grad():
candidate, subset = self.pose_estimation(oriImg)
nums, keys, locs = candidate.shape
candidate[..., 0] /= float(W)
candidate[..., 1] /= float(H)
body = candidate[:,:18].copy()
body = body.reshape(nums*18, locs)
ori_score = subset[:,:18].copy()
score = subset[:,:18].copy()
for i in range(len(score)):
for j in range(len(score[i])):
if score[i][j] > 0.3:
score[i][j] = int(18*i+j)
else:
score[i][j] = -1
un_visible = subset<0.3
candidate[un_visible] = -1
foot = candidate[:,18:24]
faces = candidate[:,24:92]
hands = candidate[:,92:113]
hands = np.vstack([hands, candidate[:,113:]])
bodies = dict(candidate=body, subset=score)
pose = dict(bodies=bodies, hands=hands, faces=faces)
return draw_pose(pose, H, W), body, ori_score, candidate
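Note (illustrative, not part of the committed file): a minimal usage sketch for the detector defined above. The package import path, checkpoint directory and image path are placeholder assumptions.

import cv2
from dwpose import DWposeDetector  # assumed import path for this module

detector = DWposeDetector(model_root="./ckpts", device="cuda")  # expects dwpose/*.onnx under model_root
img = cv2.imread("person.jpg")                                  # HxWx3 BGR array
canvas, body, scores, candidate = detector(img)                 # rendered pose map plus raw keypoints
cv2.imwrite("pose.png", canvas)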

New file (YOLOX ONNX person detector)

@@ -0,0 +1,125 @@
import cv2
import numpy as np
import onnxruntime
def nms(boxes, scores, nms_thr):
"""Single class NMS implemented in Numpy."""
x1 = boxes[:, 0]
y1 = boxes[:, 1]
x2 = boxes[:, 2]
y2 = boxes[:, 3]
areas = (x2 - x1 + 1) * (y2 - y1 + 1)
order = scores.argsort()[::-1]
keep = []
while order.size > 0:
i = order[0]
keep.append(i)
xx1 = np.maximum(x1[i], x1[order[1:]])
yy1 = np.maximum(y1[i], y1[order[1:]])
xx2 = np.minimum(x2[i], x2[order[1:]])
yy2 = np.minimum(y2[i], y2[order[1:]])
w = np.maximum(0.0, xx2 - xx1 + 1)
h = np.maximum(0.0, yy2 - yy1 + 1)
inter = w * h
ovr = inter / (areas[i] + areas[order[1:]] - inter)
inds = np.where(ovr <= nms_thr)[0]
order = order[inds + 1]
return keep
def multiclass_nms(boxes, scores, nms_thr, score_thr):
"""Multiclass NMS implemented in Numpy. Class-aware version."""
final_dets = []
num_classes = scores.shape[1]
for cls_ind in range(num_classes):
cls_scores = scores[:, cls_ind]
valid_score_mask = cls_scores > score_thr
if valid_score_mask.sum() == 0:
continue
else:
valid_scores = cls_scores[valid_score_mask]
valid_boxes = boxes[valid_score_mask]
keep = nms(valid_boxes, valid_scores, nms_thr)
if len(keep) > 0:
cls_inds = np.ones((len(keep), 1)) * cls_ind
dets = np.concatenate(
[valid_boxes[keep], valid_scores[keep, None], cls_inds], 1
)
final_dets.append(dets)
if len(final_dets) == 0:
return None
return np.concatenate(final_dets, 0)
def demo_postprocess(outputs, img_size, p6=False):
grids = []
expanded_strides = []
strides = [8, 16, 32] if not p6 else [8, 16, 32, 64]
hsizes = [img_size[0] // stride for stride in strides]
wsizes = [img_size[1] // stride for stride in strides]
for hsize, wsize, stride in zip(hsizes, wsizes, strides):
xv, yv = np.meshgrid(np.arange(wsize), np.arange(hsize))
grid = np.stack((xv, yv), 2).reshape(1, -1, 2)
grids.append(grid)
shape = grid.shape[:2]
expanded_strides.append(np.full((*shape, 1), stride))
grids = np.concatenate(grids, 1)
expanded_strides = np.concatenate(expanded_strides, 1)
outputs[..., :2] = (outputs[..., :2] + grids) * expanded_strides
outputs[..., 2:4] = np.exp(outputs[..., 2:4]) * expanded_strides
return outputs
def preprocess(img, input_size, swap=(2, 0, 1)):
if len(img.shape) == 3:
padded_img = np.ones((input_size[0], input_size[1], 3), dtype=np.uint8) * 114
else:
padded_img = np.ones(input_size, dtype=np.uint8) * 114
r = min(input_size[0] / img.shape[0], input_size[1] / img.shape[1])
resized_img = cv2.resize(
img,
(int(img.shape[1] * r), int(img.shape[0] * r)),
interpolation=cv2.INTER_LINEAR,
).astype(np.uint8)
padded_img[: int(img.shape[0] * r), : int(img.shape[1] * r)] = resized_img
padded_img = padded_img.transpose(swap)
padded_img = np.ascontiguousarray(padded_img, dtype=np.float32)
return padded_img, r
def inference_detector(session, oriImg):
input_shape = (640,640)
img, ratio = preprocess(oriImg, input_shape)
ort_inputs = {session.get_inputs()[0].name: img[None, :, :, :]}
output = session.run(None, ort_inputs)
predictions = demo_postprocess(output[0], input_shape)[0]
boxes = predictions[:, :4]
scores = predictions[:, 4:5] * predictions[:, 5:]
boxes_xyxy = np.ones_like(boxes)
boxes_xyxy[:, 0] = boxes[:, 0] - boxes[:, 2]/2.
boxes_xyxy[:, 1] = boxes[:, 1] - boxes[:, 3]/2.
boxes_xyxy[:, 2] = boxes[:, 0] + boxes[:, 2]/2.
boxes_xyxy[:, 3] = boxes[:, 1] + boxes[:, 3]/2.
boxes_xyxy /= ratio
dets = multiclass_nms(boxes_xyxy, scores, nms_thr=0.45, score_thr=0.1)
if dets is not None:
final_boxes, final_scores, final_cls_inds = dets[:, :4], dets[:, 4], dets[:, 5]
isscore = final_scores>0.3
iscat = final_cls_inds == 0
isbbox = [ i and j for (i, j) in zip(isscore, iscat)]
final_boxes = final_boxes[isbbox]
else:
final_boxes = np.array([])
return final_boxes
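Note (illustrative, not part of the committed file): the detector can be exercised on its own as sketched below; the ONNX path, execution provider and image are placeholders, and in this repo the session is normally created by the Wholebody wrapper.

import cv2
import onnxruntime as ort

session = ort.InferenceSession("dwpose/yolox_l.onnx", providers=["CPUExecutionProvider"])
img = cv2.imread("person.jpg")            # BGR, HxWx3
boxes = inference_detector(session, img)  # (N, 4) person boxes in xyxy order, or an empty array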

New file (RTMPose ONNX pose estimator)

@@ -0,0 +1,360 @@
from typing import List, Tuple
import cv2
import numpy as np
import onnxruntime as ort
def preprocess(
img: np.ndarray, out_bbox, input_size: Tuple[int, int] = (192, 256)
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
"""Do preprocessing for RTMPose model inference.
Args:
img (np.ndarray): Input image in shape (H, W, C).
out_bbox (list | np.ndarray): Detected person boxes in (x1, y1, x2, y2) format.
input_size (tuple): Input image size in shape (w, h).
Returns:
tuple:
- resized_img (np.ndarray): Preprocessed image.
- center (np.ndarray): Center of image.
- scale (np.ndarray): Scale of image.
"""
# get shape of image
img_shape = img.shape[:2]
out_img, out_center, out_scale = [], [], []
if len(out_bbox) == 0:
out_bbox = [[0, 0, img_shape[1], img_shape[0]]]
for i in range(len(out_bbox)):
x0 = out_bbox[i][0]
y0 = out_bbox[i][1]
x1 = out_bbox[i][2]
y1 = out_bbox[i][3]
bbox = np.array([x0, y0, x1, y1])
# get center and scale
center, scale = bbox_xyxy2cs(bbox, padding=1.25)
# do affine transformation
resized_img, scale = top_down_affine(input_size, scale, center, img)
# normalize image
mean = np.array([123.675, 116.28, 103.53])
std = np.array([58.395, 57.12, 57.375])
resized_img = (resized_img - mean) / std
out_img.append(resized_img)
out_center.append(center)
out_scale.append(scale)
return out_img, out_center, out_scale
def inference(sess: ort.InferenceSession, img: np.ndarray) -> np.ndarray:
"""Inference RTMPose model.
Args:
sess (ort.InferenceSession): ONNXRuntime session.
img (list[np.ndarray]): Preprocessed image crops produced by ``preprocess``.
Returns:
outputs (np.ndarray): Output of RTMPose model.
"""
all_out = []
# build input
for i in range(len(img)):
input = [img[i].transpose(2, 0, 1)]
# build output
sess_input = {sess.get_inputs()[0].name: input}
sess_output = []
for out in sess.get_outputs():
sess_output.append(out.name)
# run model
outputs = sess.run(sess_output, sess_input)
all_out.append(outputs)
return all_out
def postprocess(outputs: List[np.ndarray],
model_input_size: Tuple[int, int],
center: Tuple[int, int],
scale: Tuple[int, int],
simcc_split_ratio: float = 2.0
) -> Tuple[np.ndarray, np.ndarray]:
"""Postprocess for RTMPose model output.
Args:
outputs (np.ndarray): Output of RTMPose model.
model_input_size (tuple): RTMPose model Input image size.
center (tuple): Center of bbox in shape (x, y).
scale (tuple): Scale of bbox in shape (w, h).
simcc_split_ratio (float): Split ratio of simcc.
Returns:
tuple:
- keypoints (np.ndarray): Rescaled keypoints.
- scores (np.ndarray): Model predict scores.
"""
all_key = []
all_score = []
for i in range(len(outputs)):
# use simcc to decode
simcc_x, simcc_y = outputs[i]
keypoints, scores = decode(simcc_x, simcc_y, simcc_split_ratio)
# rescale keypoints
keypoints = keypoints / model_input_size * scale[i] + center[i] - scale[i] / 2
all_key.append(keypoints[0])
all_score.append(scores[0])
return np.array(all_key), np.array(all_score)
def bbox_xyxy2cs(bbox: np.ndarray,
padding: float = 1.) -> Tuple[np.ndarray, np.ndarray]:
"""Transform the bbox format from (x,y,w,h) into (center, scale)
Args:
bbox (ndarray): Bounding box(es) in shape (4,) or (n, 4), formatted
as (left, top, right, bottom)
padding (float): BBox padding factor that will be multiplied to scale.
Default: 1.0
Returns:
tuple: A tuple containing center and scale.
- np.ndarray[float32]: Center (x, y) of the bbox in shape (2,) or
(n, 2)
- np.ndarray[float32]: Scale (w, h) of the bbox in shape (2,) or
(n, 2)
"""
# convert single bbox from (4, ) to (1, 4)
dim = bbox.ndim
if dim == 1:
bbox = bbox[None, :]
# get bbox center and scale
x1, y1, x2, y2 = np.hsplit(bbox, [1, 2, 3])
center = np.hstack([x1 + x2, y1 + y2]) * 0.5
scale = np.hstack([x2 - x1, y2 - y1]) * padding
if dim == 1:
center = center[0]
scale = scale[0]
return center, scale
def _fix_aspect_ratio(bbox_scale: np.ndarray,
aspect_ratio: float) -> np.ndarray:
"""Extend the scale to match the given aspect ratio.
Args:
bbox_scale (np.ndarray): The image scale (w, h) in shape (2, )
aspect_ratio (float): The ratio of ``w/h``
Returns:
np.ndarray: The reshaped image scale in (2, )
"""
w, h = np.hsplit(bbox_scale, [1])
bbox_scale = np.where(w > h * aspect_ratio,
np.hstack([w, w / aspect_ratio]),
np.hstack([h * aspect_ratio, h]))
return bbox_scale
def _rotate_point(pt: np.ndarray, angle_rad: float) -> np.ndarray:
"""Rotate a point by an angle.
Args:
pt (np.ndarray): 2D point coordinates (x, y) in shape (2, )
angle_rad (float): rotation angle in radian
Returns:
np.ndarray: Rotated point in shape (2, )
"""
sn, cs = np.sin(angle_rad), np.cos(angle_rad)
rot_mat = np.array([[cs, -sn], [sn, cs]])
return rot_mat @ pt
def _get_3rd_point(a: np.ndarray, b: np.ndarray) -> np.ndarray:
"""To calculate the affine matrix, three pairs of points are required. This
function is used to get the 3rd point, given 2D points a & b.
The 3rd point is defined by rotating vector `a - b` by 90 degrees
anticlockwise, using b as the rotation center.
Args:
a (np.ndarray): The 1st point (x,y) in shape (2, )
b (np.ndarray): The 2nd point (x,y) in shape (2, )
Returns:
np.ndarray: The 3rd point.
"""
direction = a - b
c = b + np.r_[-direction[1], direction[0]]
return c
def get_warp_matrix(center: np.ndarray,
scale: np.ndarray,
rot: float,
output_size: Tuple[int, int],
shift: Tuple[float, float] = (0., 0.),
inv: bool = False) -> np.ndarray:
"""Calculate the affine transformation matrix that can warp the bbox area
in the input image to the output size.
Args:
center (np.ndarray[2, ]): Center of the bounding box (x, y).
scale (np.ndarray[2, ]): Scale of the bounding box
wrt [width, height].
rot (float): Rotation angle (degree).
output_size (np.ndarray[2, ] | list(2,)): Size of the
destination heatmaps.
shift (0-100%): Shift translation ratio wrt the width/height.
Default (0., 0.).
inv (bool): Option to inverse the affine transform direction.
(inv=False: src->dst or inv=True: dst->src)
Returns:
np.ndarray: A 2x3 transformation matrix
"""
shift = np.array(shift)
src_w = scale[0]
dst_w = output_size[0]
dst_h = output_size[1]
# compute transformation matrix
rot_rad = np.deg2rad(rot)
src_dir = _rotate_point(np.array([0., src_w * -0.5]), rot_rad)
dst_dir = np.array([0., dst_w * -0.5])
# get four corners of the src rectangle in the original image
src = np.zeros((3, 2), dtype=np.float32)
src[0, :] = center + scale * shift
src[1, :] = center + src_dir + scale * shift
src[2, :] = _get_3rd_point(src[0, :], src[1, :])
# get four corners of the dst rectangle in the input image
dst = np.zeros((3, 2), dtype=np.float32)
dst[0, :] = [dst_w * 0.5, dst_h * 0.5]
dst[1, :] = np.array([dst_w * 0.5, dst_h * 0.5]) + dst_dir
dst[2, :] = _get_3rd_point(dst[0, :], dst[1, :])
if inv:
warp_mat = cv2.getAffineTransform(np.float32(dst), np.float32(src))
else:
warp_mat = cv2.getAffineTransform(np.float32(src), np.float32(dst))
return warp_mat
def top_down_affine(input_size: Tuple[int, int], bbox_scale: np.ndarray,
bbox_center: np.ndarray, img: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
"""Get the bbox image as the model input by affine transform.
Args:
input_size (tuple): The (w, h) input size of the model.
bbox_scale (np.ndarray): The bbox scale of the img.
bbox_center (np.ndarray): The bbox center of the img.
img (np.ndarray): The original image.
Returns:
tuple: A tuple containing the cropped image and the corrected bbox scale.
- np.ndarray[float32]: img after affine transform.
- np.ndarray[float32]: bbox scale after affine transform.
"""
w, h = input_size
warp_size = (int(w), int(h))
# reshape bbox to fixed aspect ratio
bbox_scale = _fix_aspect_ratio(bbox_scale, aspect_ratio=w / h)
# get the affine matrix
center = bbox_center
scale = bbox_scale
rot = 0
warp_mat = get_warp_matrix(center, scale, rot, output_size=(w, h))
# do affine transform
img = cv2.warpAffine(img, warp_mat, warp_size, flags=cv2.INTER_LINEAR)
return img, bbox_scale
def get_simcc_maximum(simcc_x: np.ndarray,
simcc_y: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
"""Get maximum response location and value from simcc representations.
Note:
instance number: N
num_keypoints: K
heatmap height: H
heatmap width: W
Args:
simcc_x (np.ndarray): x-axis SimCC in shape (K, Wx) or (N, K, Wx)
simcc_y (np.ndarray): y-axis SimCC in shape (K, Wy) or (N, K, Wy)
Returns:
tuple:
- locs (np.ndarray): locations of maximum heatmap responses in shape
(K, 2) or (N, K, 2)
- vals (np.ndarray): values of maximum heatmap responses in shape
(K,) or (N, K)
"""
N, K, Wx = simcc_x.shape
simcc_x = simcc_x.reshape(N * K, -1)
simcc_y = simcc_y.reshape(N * K, -1)
# get maximum value locations
x_locs = np.argmax(simcc_x, axis=1)
y_locs = np.argmax(simcc_y, axis=1)
locs = np.stack((x_locs, y_locs), axis=-1).astype(np.float32)
max_val_x = np.amax(simcc_x, axis=1)
max_val_y = np.amax(simcc_y, axis=1)
# get maximum value across x and y axis
mask = max_val_x > max_val_y
max_val_x[mask] = max_val_y[mask]
vals = max_val_x
locs[vals <= 0.] = -1
# reshape
locs = locs.reshape(N, K, 2)
vals = vals.reshape(N, K)
return locs, vals
def decode(simcc_x: np.ndarray, simcc_y: np.ndarray,
simcc_split_ratio) -> Tuple[np.ndarray, np.ndarray]:
"""Modulate simcc distribution with Gaussian.
Args:
simcc_x (np.ndarray[K, Wx]): model predicted simcc in x.
simcc_y (np.ndarray[K, Wy]): model predicted simcc in y.
simcc_split_ratio (int): The split ratio of simcc.
Returns:
tuple: A tuple containing keypoints and scores.
- np.ndarray[float32]: keypoints in shape (K, 2) or (n, K, 2)
- np.ndarray[float32]: scores in shape (K,) or (n, K)
"""
keypoints, scores = get_simcc_maximum(simcc_x, simcc_y)
keypoints /= simcc_split_ratio
return keypoints, scores
def inference_pose(session, out_bbox, oriImg):
h, w = session.get_inputs()[0].shape[2:]
model_input_size = (w, h)
resized_img, center, scale = preprocess(oriImg, out_bbox, model_input_size)
outputs = inference(session, resized_img)
keypoints, scores = postprocess(outputs, model_input_size, center, scale)
return keypoints, scores
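Note (illustrative, not part of the committed file): inference_pose consumes the person boxes produced by the detector module; as preprocess above shows, an empty box list falls back to a single full-image box. The model path and image are placeholders.

import cv2
import onnxruntime as ort

pose_session = ort.InferenceSession("dwpose/dw-ll_ucoco_384.onnx", providers=["CPUExecutionProvider"])
img = cv2.imread("person.jpg")
keypoints, scores = inference_pose(pose_session, [], img)  # [] -> the whole image is treated as one box
# keypoints: (num_person, K, 2) pixel coordinates, scores: (num_person, K) confidences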

New file (OpenPose drawing and hand/face detection utilities)

@@ -0,0 +1,297 @@
import math
import numpy as np
import matplotlib
import cv2
eps = 0.01
def smart_resize(x, s):
Ht, Wt = s
if x.ndim == 2:
Ho, Wo = x.shape
Co = 1
else:
Ho, Wo, Co = x.shape
if Co == 3 or Co == 1:
k = float(Ht + Wt) / float(Ho + Wo)
return cv2.resize(x, (int(Wt), int(Ht)), interpolation=cv2.INTER_AREA if k < 1 else cv2.INTER_LANCZOS4)
else:
return np.stack([smart_resize(x[:, :, i], s) for i in range(Co)], axis=2)
def smart_resize_k(x, fx, fy):
if x.ndim == 2:
Ho, Wo = x.shape
Co = 1
else:
Ho, Wo, Co = x.shape
Ht, Wt = Ho * fy, Wo * fx
if Co == 3 or Co == 1:
k = float(Ht + Wt) / float(Ho + Wo)
return cv2.resize(x, (int(Wt), int(Ht)), interpolation=cv2.INTER_AREA if k < 1 else cv2.INTER_LANCZOS4)
else:
return np.stack([smart_resize_k(x[:, :, i], fx, fy) for i in range(Co)], axis=2)
def padRightDownCorner(img, stride, padValue):
h = img.shape[0]
w = img.shape[1]
pad = 4 * [None]
pad[0] = 0 # up
pad[1] = 0 # left
pad[2] = 0 if (h % stride == 0) else stride - (h % stride) # down
pad[3] = 0 if (w % stride == 0) else stride - (w % stride) # right
img_padded = img
pad_up = np.tile(img_padded[0:1, :, :]*0 + padValue, (pad[0], 1, 1))
img_padded = np.concatenate((pad_up, img_padded), axis=0)
pad_left = np.tile(img_padded[:, 0:1, :]*0 + padValue, (1, pad[1], 1))
img_padded = np.concatenate((pad_left, img_padded), axis=1)
pad_down = np.tile(img_padded[-2:-1, :, :]*0 + padValue, (pad[2], 1, 1))
img_padded = np.concatenate((img_padded, pad_down), axis=0)
pad_right = np.tile(img_padded[:, -2:-1, :]*0 + padValue, (1, pad[3], 1))
img_padded = np.concatenate((img_padded, pad_right), axis=1)
return img_padded, pad
def transfer(model, model_weights):
transfered_model_weights = {}
for weights_name in model.state_dict().keys():
transfered_model_weights[weights_name] = model_weights['.'.join(weights_name.split('.')[1:])]
return transfered_model_weights
def draw_bodypose(canvas, candidate, subset):
H, W, C = canvas.shape
candidate = np.array(candidate)
subset = np.array(subset)
stickwidth = 4
limbSeq = [[2, 3], [2, 6], [3, 4], [4, 5], [6, 7], [7, 8], [2, 9], [9, 10], \
[10, 11], [2, 12], [12, 13], [13, 14], [2, 1], [1, 15], [15, 17], \
[1, 16], [16, 18], [3, 17], [6, 18]]
colors = [[255, 0, 0], [255, 85, 0], [255, 170, 0], [255, 255, 0], [170, 255, 0], [85, 255, 0], [0, 255, 0], \
[0, 255, 85], [0, 255, 170], [0, 255, 255], [0, 170, 255], [0, 85, 255], [0, 0, 255], [85, 0, 255], \
[170, 0, 255], [255, 0, 255], [255, 0, 170], [255, 0, 85]]
for i in range(17):
for n in range(len(subset)):
index = subset[n][np.array(limbSeq[i]) - 1]
if -1 in index:
continue
Y = candidate[index.astype(int), 0] * float(W)
X = candidate[index.astype(int), 1] * float(H)
mX = np.mean(X)
mY = np.mean(Y)
length = ((X[0] - X[1]) ** 2 + (Y[0] - Y[1]) ** 2) ** 0.5
angle = math.degrees(math.atan2(X[0] - X[1], Y[0] - Y[1]))
polygon = cv2.ellipse2Poly((int(mY), int(mX)), (int(length / 2), stickwidth), int(angle), 0, 360, 1)
cv2.fillConvexPoly(canvas, polygon, colors[i])
canvas = (canvas * 0.6).astype(np.uint8)
for i in range(18):
for n in range(len(subset)):
index = int(subset[n][i])
if index == -1:
continue
x, y = candidate[index][0:2]
x = int(x * W)
y = int(y * H)
cv2.circle(canvas, (int(x), int(y)), 4, colors[i], thickness=-1)
return canvas
def draw_handpose(canvas, all_hand_peaks):
H, W, C = canvas.shape
edges = [[0, 1], [1, 2], [2, 3], [3, 4], [0, 5], [5, 6], [6, 7], [7, 8], [0, 9], [9, 10], \
[10, 11], [11, 12], [0, 13], [13, 14], [14, 15], [15, 16], [0, 17], [17, 18], [18, 19], [19, 20]]
for peaks in all_hand_peaks:
peaks = np.array(peaks)
for ie, e in enumerate(edges):
x1, y1 = peaks[e[0]]
x2, y2 = peaks[e[1]]
x1 = int(x1 * W)
y1 = int(y1 * H)
x2 = int(x2 * W)
y2 = int(y2 * H)
if x1 > eps and y1 > eps and x2 > eps and y2 > eps:
cv2.line(canvas, (x1, y1), (x2, y2), matplotlib.colors.hsv_to_rgb([ie / float(len(edges)), 1.0, 1.0]) * 255, thickness=2)
for i, keypoint in enumerate(peaks):
x, y = keypoint
x = int(x * W)
y = int(y * H)
if x > eps and y > eps:
cv2.circle(canvas, (x, y), 4, (0, 0, 255), thickness=-1)
return canvas
def draw_facepose(canvas, all_lmks):
H, W, C = canvas.shape
for lmks in all_lmks:
lmks = np.array(lmks)
for lmk in lmks:
x, y = lmk
x = int(x * W)
y = int(y * H)
if x > eps and y > eps:
cv2.circle(canvas, (x, y), 3, (255, 255, 255), thickness=-1)
return canvas
# detect hand according to body pose keypoints
# please refer to https://github.com/CMU-Perceptual-Computing-Lab/openpose/blob/master/src/openpose/hand/handDetector.cpp
def handDetect(candidate, subset, oriImg):
# right hand: wrist 4, elbow 3, shoulder 2
# left hand: wrist 7, elbow 6, shoulder 5
ratioWristElbow = 0.33
detect_result = []
image_height, image_width = oriImg.shape[0:2]
for person in subset.astype(int):
# if any of three not detected
has_left = np.sum(person[[5, 6, 7]] == -1) == 0
has_right = np.sum(person[[2, 3, 4]] == -1) == 0
if not (has_left or has_right):
continue
hands = []
#left hand
if has_left:
left_shoulder_index, left_elbow_index, left_wrist_index = person[[5, 6, 7]]
x1, y1 = candidate[left_shoulder_index][:2]
x2, y2 = candidate[left_elbow_index][:2]
x3, y3 = candidate[left_wrist_index][:2]
hands.append([x1, y1, x2, y2, x3, y3, True])
# right hand
if has_right:
right_shoulder_index, right_elbow_index, right_wrist_index = person[[2, 3, 4]]
x1, y1 = candidate[right_shoulder_index][:2]
x2, y2 = candidate[right_elbow_index][:2]
x3, y3 = candidate[right_wrist_index][:2]
hands.append([x1, y1, x2, y2, x3, y3, False])
for x1, y1, x2, y2, x3, y3, is_left in hands:
# pos_hand = pos_wrist + ratio * (pos_wrist - pos_elbox) = (1 + ratio) * pos_wrist - ratio * pos_elbox
# handRectangle.x = posePtr[wrist*3] + ratioWristElbow * (posePtr[wrist*3] - posePtr[elbow*3]);
# handRectangle.y = posePtr[wrist*3+1] + ratioWristElbow * (posePtr[wrist*3+1] - posePtr[elbow*3+1]);
# const auto distanceWristElbow = getDistance(poseKeypoints, person, wrist, elbow);
# const auto distanceElbowShoulder = getDistance(poseKeypoints, person, elbow, shoulder);
# handRectangle.width = 1.5f * fastMax(distanceWristElbow, 0.9f * distanceElbowShoulder);
x = x3 + ratioWristElbow * (x3 - x2)
y = y3 + ratioWristElbow * (y3 - y2)
distanceWristElbow = math.sqrt((x3 - x2) ** 2 + (y3 - y2) ** 2)
distanceElbowShoulder = math.sqrt((x2 - x1) ** 2 + (y2 - y1) ** 2)
width = 1.5 * max(distanceWristElbow, 0.9 * distanceElbowShoulder)
# x-y refers to the center --> offset to topLeft point
# handRectangle.x -= handRectangle.width / 2.f;
# handRectangle.y -= handRectangle.height / 2.f;
x -= width / 2
y -= width / 2 # width = height
# overflow the image
if x < 0: x = 0
if y < 0: y = 0
width1 = width
width2 = width
if x + width > image_width: width1 = image_width - x
if y + width > image_height: width2 = image_height - y
width = min(width1, width2)
# keep only hand boxes of at least 20 pixels
if width >= 20:
detect_result.append([int(x), int(y), int(width), is_left])
'''
return value: [[x, y, w, True if left hand else False]].
width = height, since the network requires square input.
(x, y) is the coordinate of the top-left corner.
'''
return detect_result
# Written by Lvmin
def faceDetect(candidate, subset, oriImg):
# left right eye ear 14 15 16 17
detect_result = []
image_height, image_width = oriImg.shape[0:2]
for person in subset.astype(int):
has_head = person[0] > -1
if not has_head:
continue
has_left_eye = person[14] > -1
has_right_eye = person[15] > -1
has_left_ear = person[16] > -1
has_right_ear = person[17] > -1
if not (has_left_eye or has_right_eye or has_left_ear or has_right_ear):
continue
head, left_eye, right_eye, left_ear, right_ear = person[[0, 14, 15, 16, 17]]
width = 0.0
x0, y0 = candidate[head][:2]
if has_left_eye:
x1, y1 = candidate[left_eye][:2]
d = max(abs(x0 - x1), abs(y0 - y1))
width = max(width, d * 3.0)
if has_right_eye:
x1, y1 = candidate[right_eye][:2]
d = max(abs(x0 - x1), abs(y0 - y1))
width = max(width, d * 3.0)
if has_left_ear:
x1, y1 = candidate[left_ear][:2]
d = max(abs(x0 - x1), abs(y0 - y1))
width = max(width, d * 1.5)
if has_right_ear:
x1, y1 = candidate[right_ear][:2]
d = max(abs(x0 - x1), abs(y0 - y1))
width = max(width, d * 1.5)
x, y = x0, y0
x -= width
y -= width
if x < 0:
x = 0
if y < 0:
y = 0
width1 = width * 2
width2 = width * 2
if x + width > image_width:
width1 = image_width - x
if y + width > image_height:
width2 = image_height - y
width = min(width1, width2)
if width >= 20:
detect_result.append([int(x), int(y), int(width)])
return detect_result
# get max index of 2d array
def npmax(array):
arrayindex = array.argmax(1)
arrayvalue = array.max(1)
i = arrayvalue.argmax()
j = arrayindex[i]
return i, j
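Note (toy sketch, not part of the committed file): the drawing helpers expect keypoints normalized to [0, 1]; the snippet below renders one fabricated body pose onto an empty canvas.

import numpy as np

canvas = np.zeros((512, 512, 3), dtype=np.uint8)
candidate = np.random.rand(18, 2)                   # 18 body keypoints, normalized (x, y)
subset = np.arange(18, dtype=float).reshape(1, 18)  # one person, every joint assigned an index
canvas = draw_bodypose(canvas, candidate, subset)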

New file (Wholebody detector and pose wrapper)

@@ -0,0 +1,46 @@
import cv2
import numpy as np
import os
import onnxruntime as ort
from .onnxdet import inference_detector
from .onnxpose import inference_pose
class Wholebody:
def __init__(self, model_root, device):
providers = ['CPUExecutionProvider'] if device == 'cpu' else ['CUDAExecutionProvider']
onnx_det = os.path.join(model_root, 'dwpose/yolox_l.onnx')
onnx_pose = os.path.join(model_root, 'dwpose/dw-ll_ucoco_384.onnx')
self.session_det = ort.InferenceSession(path_or_bytes=onnx_det, providers=providers)
self.session_pose = ort.InferenceSession(path_or_bytes=onnx_pose, providers=providers)
def __call__(self, oriImg):
det_result = inference_detector(self.session_det, oriImg)
keypoints, scores = inference_pose(self.session_pose, det_result, oriImg)
keypoints_info = np.concatenate(
(keypoints, scores[..., None]), axis=-1)
# compute neck joint
neck = np.mean(keypoints_info[:, [5, 6]], axis=1)
# neck score when visualizing pred
neck[:, 2:4] = np.logical_and(
keypoints_info[:, 5, 2:4] > 0.3,
keypoints_info[:, 6, 2:4] > 0.3).astype(int)
new_keypoints_info = np.insert(
keypoints_info, 17, neck, axis=1)
mmpose_idx = [
17, 6, 8, 10, 7, 9, 12, 14, 16, 13, 15, 2, 1, 4, 3
]
openpose_idx = [
1, 2, 3, 4, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17
]
new_keypoints_info[:, openpose_idx] = \
new_keypoints_info[:, mmpose_idx]
keypoints_info = new_keypoints_info
keypoints, scores = keypoints_info[
..., :2], keypoints_info[..., 2]
return keypoints, scores

New file (LIP dataset definitions)

@@ -0,0 +1,201 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
"""
@Author : Peike Li
@Contact : peike.li@yahoo.com
@File : datasets.py
@Time : 8/4/19 3:35 PM
@Desc :
@License : This source code is licensed under the license found in the
LICENSE file in the root directory of this source tree.
"""
import os
import numpy as np
import random
import torch
import cv2
from torch.utils import data
from utils.transforms import get_affine_transform
class LIPDataSet(data.Dataset):
def __init__(self, root, dataset, crop_size=[473, 473], scale_factor=0.25,
rotation_factor=30, ignore_label=255, transform=None):
self.root = root
self.aspect_ratio = crop_size[1] * 1.0 / crop_size[0]
self.crop_size = np.asarray(crop_size)
self.ignore_label = ignore_label
self.scale_factor = scale_factor
self.rotation_factor = rotation_factor
self.flip_prob = 0.5
self.transform = transform
self.dataset = dataset
list_path = os.path.join(self.root, self.dataset + '_id.txt')
train_list = [i_id.strip() for i_id in open(list_path)]
self.train_list = train_list
self.number_samples = len(self.train_list)
def __len__(self):
return self.number_samples
def _box2cs(self, box):
x, y, w, h = box[:4]
return self._xywh2cs(x, y, w, h)
def _xywh2cs(self, x, y, w, h):
center = np.zeros((2), dtype=np.float32)
center[0] = x + w * 0.5
center[1] = y + h * 0.5
if w > self.aspect_ratio * h:
h = w * 1.0 / self.aspect_ratio
elif w < self.aspect_ratio * h:
w = h * self.aspect_ratio
scale = np.array([w * 1.0, h * 1.0], dtype=np.float32)
return center, scale
def __getitem__(self, index):
train_item = self.train_list[index]
im_path = os.path.join(self.root, self.dataset + '_images', train_item + '.jpg')
parsing_anno_path = os.path.join(self.root, self.dataset + '_segmentations', train_item + '.png')
im = cv2.imread(im_path, cv2.IMREAD_COLOR)
h, w, _ = im.shape
parsing_anno = np.zeros((h, w), dtype=np.int64)
# Get person center and scale
person_center, s = self._box2cs([0, 0, w - 1, h - 1])
r = 0
if self.dataset != 'test':
# Get pose annotation
parsing_anno = cv2.imread(parsing_anno_path, cv2.IMREAD_GRAYSCALE)
if self.dataset == 'train' or self.dataset == 'trainval':
sf = self.scale_factor
rf = self.rotation_factor
s = s * np.clip(np.random.randn() * sf + 1, 1 - sf, 1 + sf)
r = np.clip(np.random.randn() * rf, -rf * 2, rf * 2) if random.random() <= 0.6 else 0
if random.random() <= self.flip_prob:
im = im[:, ::-1, :]
parsing_anno = parsing_anno[:, ::-1]
person_center[0] = im.shape[1] - person_center[0] - 1
right_idx = [15, 17, 19]
left_idx = [14, 16, 18]
for i in range(0, 3):
right_pos = np.where(parsing_anno == right_idx[i])
left_pos = np.where(parsing_anno == left_idx[i])
parsing_anno[right_pos[0], right_pos[1]] = left_idx[i]
parsing_anno[left_pos[0], left_pos[1]] = right_idx[i]
trans = get_affine_transform(person_center, s, r, self.crop_size)
input = cv2.warpAffine(
im,
trans,
(int(self.crop_size[1]), int(self.crop_size[0])),
flags=cv2.INTER_LINEAR,
borderMode=cv2.BORDER_CONSTANT,
borderValue=(0, 0, 0))
if self.transform:
input = self.transform(input)
meta = {
'name': train_item,
'center': person_center,
'height': h,
'width': w,
'scale': s,
'rotation': r
}
if self.dataset == 'val' or self.dataset == 'test':
return input, meta
else:
label_parsing = cv2.warpAffine(
parsing_anno,
trans,
(int(self.crop_size[1]), int(self.crop_size[0])),
flags=cv2.INTER_NEAREST,
borderMode=cv2.BORDER_CONSTANT,
borderValue=(255))
label_parsing = torch.from_numpy(label_parsing)
return input, label_parsing, meta
class LIPDataValSet(data.Dataset):
def __init__(self, root, dataset='val', crop_size=[473, 473], transform=None, flip=False):
self.root = root
self.crop_size = crop_size
self.transform = transform
self.flip = flip
self.dataset = dataset
self.root = root
self.aspect_ratio = crop_size[1] * 1.0 / crop_size[0]
self.crop_size = np.asarray(crop_size)
list_path = os.path.join(self.root, self.dataset + '_id.txt')
val_list = [i_id.strip() for i_id in open(list_path)]
self.val_list = val_list
self.number_samples = len(self.val_list)
def __len__(self):
return len(self.val_list)
def _box2cs(self, box):
x, y, w, h = box[:4]
return self._xywh2cs(x, y, w, h)
def _xywh2cs(self, x, y, w, h):
center = np.zeros((2), dtype=np.float32)
center[0] = x + w * 0.5
center[1] = y + h * 0.5
if w > self.aspect_ratio * h:
h = w * 1.0 / self.aspect_ratio
elif w < self.aspect_ratio * h:
w = h * self.aspect_ratio
scale = np.array([w * 1.0, h * 1.0], dtype=np.float32)
return center, scale
def __getitem__(self, index):
val_item = self.val_list[index]
# Load training image
im_path = os.path.join(self.root, self.dataset + '_images', val_item + '.jpg')
im = cv2.imread(im_path, cv2.IMREAD_COLOR)
h, w, _ = im.shape
# Get person center and scale
person_center, s = self._box2cs([0, 0, w - 1, h - 1])
r = 0
trans = get_affine_transform(person_center, s, r, self.crop_size)
input = cv2.warpAffine(
im,
trans,
(int(self.crop_size[1]), int(self.crop_size[0])),
flags=cv2.INTER_LINEAR,
borderMode=cv2.BORDER_CONSTANT,
borderValue=(0, 0, 0))
input = self.transform(input)
flip_input = input.flip(dims=[-1])
if self.flip:
batch_input_im = torch.stack([input, flip_input])
else:
batch_input_im = input
meta = {
'name': val_item,
'center': person_center,
'height': h,
'width': w,
'scale': s,
'rotation': r
}
return batch_input_im, meta
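Note (hedged usage sketch, not part of the committed file): LIPDataSet plugs into a standard PyTorch DataLoader; the data root, normalization constants and batch size below are placeholder assumptions.

import torchvision.transforms as T
from torch.utils.data import DataLoader

transform = T.Compose([T.ToTensor(),
                       T.Normalize(mean=[0.406, 0.456, 0.485], std=[0.225, 0.224, 0.229])])
train_set = LIPDataSet(root="./data/LIP", dataset="train", crop_size=[473, 473], transform=transform)
loader = DataLoader(train_set, batch_size=16, shuffle=True, num_workers=4)
images, labels, meta = next(iter(loader))  # images: 16x3x473x473, labels: 16x473x473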

New file (single-folder inference dataset)

@@ -0,0 +1,89 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
"""
@Author : Peike Li
@Contact : peike.li@yahoo.com
@File : dataset.py
@Time : 8/30/19 9:12 PM
@Desc : Dataset Definition
@License : This source code is licensed under the license found in the
LICENSE file in the root directory of this source tree.
"""
import os
import pdb
import cv2
import numpy as np
from PIL import Image
from torch.utils import data
from utils.transforms import get_affine_transform
class SimpleFolderDataset(data.Dataset):
def __init__(self, root, input_size=[512, 512], transform=None):
self.root = root
self.input_size = input_size
self.transform = transform
self.aspect_ratio = input_size[1] * 1.0 / input_size[0]
self.input_size = np.asarray(input_size)
self.is_pil_image = False
if isinstance(root, Image.Image):
self.file_list = [root]
self.is_pil_image = True
elif os.path.isfile(root):
self.file_list = [os.path.basename(root)]
self.root = os.path.dirname(root)
else:
self.file_list = os.listdir(self.root)
def __len__(self):
return len(self.file_list)
def _box2cs(self, box):
x, y, w, h = box[:4]
return self._xywh2cs(x, y, w, h)
def _xywh2cs(self, x, y, w, h):
center = np.zeros((2), dtype=np.float32)
center[0] = x + w * 0.5
center[1] = y + h * 0.5
if w > self.aspect_ratio * h:
h = w * 1.0 / self.aspect_ratio
elif w < self.aspect_ratio * h:
w = h * self.aspect_ratio
scale = np.array([w, h], dtype=np.float32)
return center, scale
def __getitem__(self, index):
if self.is_pil_image:
img = np.asarray(self.file_list[index])[:, :, [2, 1, 0]]
else:
img_name = self.file_list[index]
img_path = os.path.join(self.root, img_name)
img = cv2.imread(img_path, cv2.IMREAD_COLOR)
h, w, _ = img.shape
# Get person center and scale
person_center, s = self._box2cs([0, 0, w - 1, h - 1])
r = 0
trans = get_affine_transform(person_center, s, r, self.input_size)
input = cv2.warpAffine(
img,
trans,
(int(self.input_size[1]), int(self.input_size[0])),
flags=cv2.INTER_LINEAR,
borderMode=cv2.BORDER_CONSTANT,
borderValue=(0, 0, 0))
input = self.transform(input)
meta = {
'center': person_center,
'height': h,
'width': w,
'scale': s,
'rotation': r
}
return input, meta
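Note (hedged usage sketch, not part of the committed file): SimpleFolderDataset also accepts a single PIL image, which is convenient for one-off inference; the image path and transform are placeholders.

from PIL import Image
import torchvision.transforms as T

transform = T.Compose([T.ToTensor()])
dataset = SimpleFolderDataset(root=Image.open("person.jpg"), input_size=[512, 512], transform=transform)
image, meta = dataset[0]  # 3x512x512 tensor; meta carries center/scale for undoing the affine warp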

New file (edge-map generation utility)

@@ -0,0 +1,40 @@
import torch
from torch.nn import functional as F
def generate_edge_tensor(label, edge_width=3):
label = label.type(torch.cuda.FloatTensor)
if len(label.shape) == 2:
label = label.unsqueeze(0)
n, h, w = label.shape
edge = torch.zeros(label.shape, dtype=torch.float).cuda()
# right
edge_right = edge[:, 1:h, :]
edge_right[(label[:, 1:h, :] != label[:, :h - 1, :]) & (label[:, 1:h, :] != 255)
& (label[:, :h - 1, :] != 255)] = 1
# up
edge_up = edge[:, :, :w - 1]
edge_up[(label[:, :, :w - 1] != label[:, :, 1:w])
& (label[:, :, :w - 1] != 255)
& (label[:, :, 1:w] != 255)] = 1
# upright
edge_upright = edge[:, :h - 1, :w - 1]
edge_upright[(label[:, :h - 1, :w - 1] != label[:, 1:h, 1:w])
& (label[:, :h - 1, :w - 1] != 255)
& (label[:, 1:h, 1:w] != 255)] = 1
# bottomright
edge_bottomright = edge[:, :h - 1, 1:w]
edge_bottomright[(label[:, :h - 1, 1:w] != label[:, 1:h, :w - 1])
& (label[:, :h - 1, 1:w] != 255)
& (label[:, 1:h, :w - 1] != 255)] = 1
kernel = torch.ones((1, 1, edge_width, edge_width), dtype=torch.float).cuda()
with torch.no_grad():
edge = edge.unsqueeze(1)
edge = F.conv2d(edge, kernel, stride=1, padding=1)
edge[edge!=0] = 1
edge = edge.squeeze()
return edge
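Note (minimal sketch, not part of the committed file; CUDA is required because the helper allocates CUDA tensors): build a toy two-class label map and extract its boundary mask.

import torch

label = torch.zeros(64, 64, dtype=torch.long)
label[16:48, 16:48] = 1                           # a square region of class 1
edge = generate_edge_tensor(label, edge_width=3)
print(edge.shape, int(edge.sum()))                # (64, 64) binary mask, nonzero along the square's border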

New file (human-mask to COCO annotation converter, train/val)

@@ -0,0 +1,166 @@
import argparse
import datetime
import json
import os
from PIL import Image
import numpy as np
import pycococreatortools
def get_arguments():
parser = argparse.ArgumentParser(description="transform mask annotation to coco annotation")
parser.add_argument("--dataset", type=str, default='CIHP', help="name of dataset (CIHP, MHPv2 or VIP)")
parser.add_argument("--json_save_dir", type=str, default='../data/msrcnn_finetune_annotations',
help="path to save coco-style annotation json file")
parser.add_argument("--use_val", type=bool, default=False,
help="use train+val set for finetuning or not")
parser.add_argument("--train_img_dir", type=str, default='../data/instance-level_human_parsing/Training/Images',
help="train image path")
parser.add_argument("--train_anno_dir", type=str,
default='../data/instance-level_human_parsing/Training/Human_ids',
help="train human mask path")
parser.add_argument("--val_img_dir", type=str, default='../data/instance-level_human_parsing/Validation/Images',
help="val image path")
parser.add_argument("--val_anno_dir", type=str,
default='../data/instance-level_human_parsing/Validation/Human_ids',
help="val human mask path")
return parser.parse_args()
def main(args):
INFO = {
"description": args.split_name + " Dataset",
"url": "",
"version": "",
"year": 2019,
"contributor": "xyq",
"date_created": datetime.datetime.utcnow().isoformat(' ')
}
LICENSES = [
{
"id": 1,
"name": "",
"url": ""
}
]
CATEGORIES = [
{
'id': 1,
'name': 'person',
'supercategory': 'person',
},
]
coco_output = {
"info": INFO,
"licenses": LICENSES,
"categories": CATEGORIES,
"images": [],
"annotations": []
}
image_id = 1
segmentation_id = 1
for image_name in os.listdir(args.train_img_dir):
image = Image.open(os.path.join(args.train_img_dir, image_name))
image_info = pycococreatortools.create_image_info(
image_id, image_name, image.size
)
coco_output["images"].append(image_info)
human_mask_name = os.path.splitext(image_name)[0] + '.png'
human_mask = np.asarray(Image.open(os.path.join(args.train_anno_dir, human_mask_name)))
human_gt_labels = np.unique(human_mask)
for i in range(1, len(human_gt_labels)):
category_info = {'id': 1, 'is_crowd': 0}
binary_mask = np.uint8(human_mask == i)
annotation_info = pycococreatortools.create_annotation_info(
segmentation_id, image_id, category_info, binary_mask,
image.size, tolerance=10
)
if annotation_info is not None:
coco_output["annotations"].append(annotation_info)
segmentation_id += 1
image_id += 1
if not os.path.exists(args.json_save_dir):
os.makedirs(args.json_save_dir)
if not args.use_val:
with open('{}/{}_train.json'.format(args.json_save_dir, args.dataset), 'w') as output_json_file:
json.dump(coco_output, output_json_file)
else:
for image_name in os.listdir(args.val_img_dir):
image = Image.open(os.path.join(args.val_img_dir, image_name))
image_info = pycococreatortools.create_image_info(
image_id, image_name, image.size
)
coco_output["images"].append(image_info)
human_mask_name = os.path.splitext(image_name)[0] + '.png'
human_mask = np.asarray(Image.open(os.path.join(args.val_anno_dir, human_mask_name)))
human_gt_labels = np.unique(human_mask)
for i in range(1, len(human_gt_labels)):
category_info = {'id': 1, 'is_crowd': 0}
binary_mask = np.uint8(human_mask == i)
annotation_info = pycococreatortools.create_annotation_info(
segmentation_id, image_id, category_info, binary_mask,
image.size, tolerance=10
)
if annotation_info is not None:
coco_output["annotations"].append(annotation_info)
segmentation_id += 1
image_id += 1
with open('{}/{}_trainval.json'.format(args.json_save_dir, args.dataset), 'w') as output_json_file:
json.dump(coco_output, output_json_file)
coco_output_val = {
"info": INFO,
"licenses": LICENSES,
"categories": CATEGORIES,
"images": [],
"annotations": []
}
image_id_val = 1
segmentation_id_val = 1
for image_name in os.listdir(args.val_img_dir):
image = Image.open(os.path.join(args.val_img_dir, image_name))
image_info = pycococreatortools.create_image_info(
image_id_val, image_name, image.size
)
coco_output_val["images"].append(image_info)
human_mask_name = os.path.splitext(image_name)[0] + '.png'
human_mask = np.asarray(Image.open(os.path.join(args.val_anno_dir, human_mask_name)))
human_gt_labels = np.unique(human_mask)
for i in range(1, len(human_gt_labels)):
category_info = {'id': 1, 'is_crowd': 0}
binary_mask = np.uint8(human_mask == i)
annotation_info = pycococreatortools.create_annotation_info(
segmentation_id_val, image_id_val, category_info, binary_mask,
image.size, tolerance=10
)
if annotation_info is not None:
coco_output_val["annotations"].append(annotation_info)
segmentation_id_val += 1
image_id_val += 1
with open('{}/{}_val.json'.format(args.json_save_dir, args.dataset), 'w') as output_json_file_val:
json.dump(coco_output_val, output_json_file_val)
if __name__ == "__main__":
args = get_arguments()
main(args)

New file (pycococreatortools helpers)

@@ -0,0 +1,114 @@
import re
import datetime
import numpy as np
from itertools import groupby
from skimage import measure
from PIL import Image
from pycocotools import mask
convert = lambda text: int(text) if text.isdigit() else text.lower()
natrual_key = lambda key: [convert(c) for c in re.split('([0-9]+)', key)]
def resize_binary_mask(array, new_size):
image = Image.fromarray(array.astype(np.uint8) * 255)
image = image.resize(new_size)
return np.asarray(image).astype(np.bool_)
def close_contour(contour):
if not np.array_equal(contour[0], contour[-1]):
contour = np.vstack((contour, contour[0]))
return contour
def binary_mask_to_rle(binary_mask):
rle = {'counts': [], 'size': list(binary_mask.shape)}
counts = rle.get('counts')
for i, (value, elements) in enumerate(groupby(binary_mask.ravel(order='F'))):
if i == 0 and value == 1:
counts.append(0)
counts.append(len(list(elements)))
return rle
def binary_mask_to_polygon(binary_mask, tolerance=0):
"""Converts a binary mask to COCO polygon representation
Args:
binary_mask: a 2D binary numpy array where '1's represent the object
tolerance: Maximum distance from original points of polygon to approximated
polygonal chain. If tolerance is 0, the original coordinate array is returned.
"""
polygons = []
# pad mask to close contours of shapes which start and end at an edge
padded_binary_mask = np.pad(binary_mask, pad_width=1, mode='constant', constant_values=0)
contours = measure.find_contours(padded_binary_mask, 0.5)
contours = np.subtract(contours, 1)
for contour in contours:
contour = close_contour(contour)
contour = measure.approximate_polygon(contour, tolerance)
if len(contour) < 3:
continue
contour = np.flip(contour, axis=1)
segmentation = contour.ravel().tolist()
# after padding and subtracting 1 we may get -0.5 points in our segmentation
segmentation = [0 if i < 0 else i for i in segmentation]
polygons.append(segmentation)
return polygons
def create_image_info(image_id, file_name, image_size,
date_captured=datetime.datetime.utcnow().isoformat(' '),
license_id=1, coco_url="", flickr_url=""):
image_info = {
"id": image_id,
"file_name": file_name,
"width": image_size[0],
"height": image_size[1],
"date_captured": date_captured,
"license": license_id,
"coco_url": coco_url,
"flickr_url": flickr_url
}
return image_info
def create_annotation_info(annotation_id, image_id, category_info, binary_mask,
image_size=None, tolerance=2, bounding_box=None):
if image_size is not None:
binary_mask = resize_binary_mask(binary_mask, image_size)
binary_mask_encoded = mask.encode(np.asfortranarray(binary_mask.astype(np.uint8)))
area = mask.area(binary_mask_encoded)
if area < 1:
return None
if bounding_box is None:
bounding_box = mask.toBbox(binary_mask_encoded)
if category_info["is_crowd"]:
is_crowd = 1
segmentation = binary_mask_to_rle(binary_mask)
else:
is_crowd = 0
segmentation = binary_mask_to_polygon(binary_mask, tolerance)
if not segmentation:
return None
annotation_info = {
"id": annotation_id,
"image_id": image_id,
"category_id": category_info["id"],
"iscrowd": is_crowd,
"area": area.tolist(),
"bbox": bounding_box.tolist(),
"segmentation": segmentation,
"width": binary_mask.shape[1],
"height": binary_mask.shape[0],
}
return annotation_info
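Note (hedged usage sketch, not part of the committed file): the helpers above build COCO-style records from a binary mask; the ids and file name below are placeholders.

import numpy as np

binary_mask = np.zeros((64, 64), dtype=np.uint8)
binary_mask[20:40, 20:40] = 1                     # one 20x20 "person" blob
image_info = create_image_info(image_id=1, file_name="0001.jpg", image_size=(64, 64))
annotation = create_annotation_info(annotation_id=1, image_id=1,
                                    category_info={"id": 1, "is_crowd": 0},
                                    binary_mask=binary_mask, image_size=(64, 64), tolerance=2)
print(image_info["file_name"], annotation["bbox"], annotation["area"])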

New file (COCO-style test-set annotation generator)

@@ -0,0 +1,74 @@
import argparse
import datetime
import json
import os
from PIL import Image
import pycococreatortools
def get_arguments():
parser = argparse.ArgumentParser(description="transform mask annotation to coco annotation")
parser.add_argument("--dataset", type=str, default='CIHP', help="name of dataset (CIHP, MHPv2 or VIP)")
parser.add_argument("--json_save_dir", type=str, default='../data/CIHP/annotations',
help="path to save coco-style annotation json file")
parser.add_argument("--test_img_dir", type=str, default='../data/CIHP/Testing/Images',
help="test image path")
return parser.parse_args()
args = get_arguments()
INFO = {
"description": args.dataset + "Dataset",
"url": "",
"version": "",
"year": 2020,
"contributor": "yunqiuxu",
"date_created": datetime.datetime.utcnow().isoformat(' ')
}
LICENSES = [
{
"id": 1,
"name": "",
"url": ""
}
]
CATEGORIES = [
{
'id': 1,
'name': 'person',
'supercategory': 'person',
},
]
def main(args):
coco_output = {
"info": INFO,
"licenses": LICENSES,
"categories": CATEGORIES,
"images": [],
"annotations": []
}
image_id = 1
for image_name in os.listdir(args.test_img_dir):
image = Image.open(os.path.join(args.test_img_dir, image_name))
image_info = pycococreatortools.create_image_info(
image_id, image_name, image.size
)
coco_output["images"].append(image_info)
image_id += 1
if not os.path.exists(os.path.join(args.json_save_dir)):
os.mkdir(os.path.join(args.json_save_dir))
with open('{}/{}.json'.format(args.json_save_dir, args.dataset), 'w') as output_json_file:
json.dump(coco_output, output_json_file)
if __name__ == "__main__":
main(args)

New file (CircleCI configuration)

@@ -0,0 +1,179 @@
# Python CircleCI 2.0 configuration file
#
# Check https://circleci.com/docs/2.0/language-python/ for more details
#
version: 2
# -------------------------------------------------------------------------------------
# Environments to run the jobs in
# -------------------------------------------------------------------------------------
cpu: &cpu
docker:
- image: circleci/python:3.6.8-stretch
resource_class: medium
gpu: &gpu
machine:
image: ubuntu-1604:201903-01
docker_layer_caching: true
resource_class: gpu.small
# -------------------------------------------------------------------------------------
# Re-usable commands
# -------------------------------------------------------------------------------------
install_python: &install_python
- run:
name: Install Python
working_directory: ~/
command: |
pyenv install 3.6.1
pyenv global 3.6.1
setup_venv: &setup_venv
- run:
name: Setup Virtual Env
working_directory: ~/
command: |
python -m venv ~/venv
echo ". ~/venv/bin/activate" >> $BASH_ENV
. ~/venv/bin/activate
python --version
which python
which pip
pip install --upgrade pip
install_dep: &install_dep
- run:
name: Install Dependencies
command: |
pip install --progress-bar off -U 'git+https://github.com/facebookresearch/fvcore'
pip install --progress-bar off cython opencv-python
pip install --progress-bar off 'git+https://github.com/cocodataset/cocoapi.git#subdirectory=PythonAPI'
pip install --progress-bar off torch torchvision
install_detectron2: &install_detectron2
- run:
name: Install Detectron2
command: |
gcc --version
pip install -U --progress-bar off -e .[dev]
python -m detectron2.utils.collect_env
install_nvidia_driver: &install_nvidia_driver
- run:
name: Install nvidia driver
working_directory: ~/
command: |
wget -q 'https://s3.amazonaws.com/ossci-linux/nvidia_driver/NVIDIA-Linux-x86_64-430.40.run'
sudo /bin/bash ./NVIDIA-Linux-x86_64-430.40.run -s --no-drm
nvidia-smi
run_unittests: &run_unittests
- run:
name: Run Unit Tests
command: |
python -m unittest discover -v -s tests
# -------------------------------------------------------------------------------------
# Jobs to run
# -------------------------------------------------------------------------------------
jobs:
cpu_tests:
<<: *cpu
working_directory: ~/detectron2
steps:
- checkout
- <<: *setup_venv
# Cache the venv directory that contains dependencies
- restore_cache:
keys:
- cache-key-{{ .Branch }}-ID-20200425
- <<: *install_dep
- save_cache:
paths:
- ~/venv
key: cache-key-{{ .Branch }}-ID-20200425
- <<: *install_detectron2
- run:
name: isort
command: |
isort -c -sp .
- run:
name: black
command: |
black --check -l 100 .
- run:
name: flake8
command: |
flake8 .
- <<: *run_unittests
gpu_tests:
<<: *gpu
working_directory: ~/detectron2
steps:
- checkout
- <<: *install_nvidia_driver
- run:
name: Install nvidia-docker
working_directory: ~/
command: |
curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add -
distribution=$(. /etc/os-release;echo $ID$VERSION_ID)
curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | \
sudo tee /etc/apt/sources.list.d/nvidia-docker.list
sudo apt-get update && sudo apt-get install -y nvidia-docker2
# reload the docker daemon configuration
sudo pkill -SIGHUP dockerd
- run:
name: Launch docker
working_directory: ~/detectron2/docker
command: |
nvidia-docker build -t detectron2:v0 -f Dockerfile-circleci .
nvidia-docker run -itd --name d2 detectron2:v0
docker exec -it d2 nvidia-smi
- run:
name: Build Detectron2
command: |
docker exec -it d2 pip install 'git+https://github.com/facebookresearch/fvcore'
docker cp ~/detectron2 d2:/detectron2
# This will build d2 for the target GPU arch only
docker exec -it d2 pip install -e /detectron2
docker exec -it d2 python3 -m detectron2.utils.collect_env
docker exec -it d2 python3 -c 'import torch; assert(torch.cuda.is_available())'
- run:
name: Run Unit Tests
command: |
docker exec -e CIRCLECI=true -it d2 python3 -m unittest discover -v -s /detectron2/tests
workflows:
version: 2
regular_test:
jobs:
- cpu_tests
- gpu_tests
#nightly_test:
#jobs:
#- gpu_tests
#triggers:
#- schedule:
#cron: "0 0 * * *"
#filters:
#branches:
#only:
#- master

New file (clang-format configuration)

@@ -0,0 +1,85 @@
AccessModifierOffset: -1
AlignAfterOpenBracket: AlwaysBreak
AlignConsecutiveAssignments: false
AlignConsecutiveDeclarations: false
AlignEscapedNewlinesLeft: true
AlignOperands: false
AlignTrailingComments: false
AllowAllParametersOfDeclarationOnNextLine: false
AllowShortBlocksOnASingleLine: false
AllowShortCaseLabelsOnASingleLine: false
AllowShortFunctionsOnASingleLine: Empty
AllowShortIfStatementsOnASingleLine: false
AllowShortLoopsOnASingleLine: false
AlwaysBreakAfterReturnType: None
AlwaysBreakBeforeMultilineStrings: true
AlwaysBreakTemplateDeclarations: true
BinPackArguments: false
BinPackParameters: false
BraceWrapping:
AfterClass: false
AfterControlStatement: false
AfterEnum: false
AfterFunction: false
AfterNamespace: false
AfterObjCDeclaration: false
AfterStruct: false
AfterUnion: false
BeforeCatch: false
BeforeElse: false
IndentBraces: false
BreakBeforeBinaryOperators: None
BreakBeforeBraces: Attach
BreakBeforeTernaryOperators: true
BreakConstructorInitializersBeforeComma: false
BreakAfterJavaFieldAnnotations: false
BreakStringLiterals: false
ColumnLimit: 80
CommentPragmas: '^ IWYU pragma:'
ConstructorInitializerAllOnOneLineOrOnePerLine: true
ConstructorInitializerIndentWidth: 4
ContinuationIndentWidth: 4
Cpp11BracedListStyle: true
DerivePointerAlignment: false
DisableFormat: false
ForEachMacros: [ FOR_EACH, FOR_EACH_ENUMERATE, FOR_EACH_KV, FOR_EACH_R, FOR_EACH_RANGE, ]
IncludeCategories:
- Regex: '^<.*\.h(pp)?>'
Priority: 1
- Regex: '^<.*'
Priority: 2
- Regex: '.*'
Priority: 3
IndentCaseLabels: true
IndentWidth: 2
IndentWrappedFunctionNames: false
KeepEmptyLinesAtTheStartOfBlocks: false
MacroBlockBegin: ''
MacroBlockEnd: ''
MaxEmptyLinesToKeep: 1
NamespaceIndentation: None
ObjCBlockIndentWidth: 2
ObjCSpaceAfterProperty: false
ObjCSpaceBeforeProtocolList: false
PenaltyBreakBeforeFirstCallParameter: 1
PenaltyBreakComment: 300
PenaltyBreakFirstLessLess: 120
PenaltyBreakString: 1000
PenaltyExcessCharacter: 1000000
PenaltyReturnTypeOnItsOwnLine: 200
PointerAlignment: Left
ReflowComments: true
SortIncludes: true
SpaceAfterCStyleCast: false
SpaceBeforeAssignmentOperators: true
SpaceBeforeParens: ControlStatements
SpaceInEmptyParentheses: false
SpacesBeforeTrailingComments: 1
SpacesInAngles: false
SpacesInContainerLiterals: true
SpacesInCStyleCastParentheses: false
SpacesInParentheses: false
SpacesInSquareBrackets: false
Standard: Cpp11
TabWidth: 8
UseTab: Never

New file (flake8 configuration)

@@ -0,0 +1,9 @@
# This is an example .flake8 config, used when developing *Black* itself.
# Keep in sync with setup.cfg which is used for source packages.
[flake8]
ignore = W503, E203, E221, C901, C408, E741
max-line-length = 100
max-complexity = 18
select = B,C,E,F,W,T4,B9
exclude = build,__init__.py

New file (Code of Conduct)

@@ -0,0 +1,5 @@
# Code of Conduct
Facebook has adopted a Code of Conduct that we expect project participants to adhere to.
Please read the [full text](https://code.fb.com/codeofconduct/)
so that you can understand what actions will and will not be tolerated.

New file (contributing guide)

@@ -0,0 +1,49 @@
# Contributing to detectron2
## Issues
We use GitHub issues to track public bugs and questions.
Please make sure to follow one of the
[issue templates](https://github.com/facebookresearch/detectron2/issues/new/choose)
when reporting any issues.
Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe
disclosure of security bugs. In those cases, please go through the process
outlined on that page and do not file a public issue.
## Pull Requests
We actively welcome your pull requests.
However, if you're adding any significant features (e.g. > 50 lines), please
make sure to have a corresponding issue to discuss your motivation and proposals,
before sending a PR. We do not always accept new features, and we take the following
factors into consideration:
1. Whether the same feature can be achieved without modifying detectron2.
Detectron2 is designed so that you can implement many extensions from the outside, e.g.
those in [projects](https://github.com/facebookresearch/detectron2/tree/master/projects).
If some part is not as extensible, you can also bring up the issue to make it more extensible.
2. Whether the feature is potentially useful to a large audience, or only to a small portion of users.
3. Whether the proposed solution has a good design / interface.
4. Whether the proposed solution adds extra mental/practical overhead to users who don't
need such a feature.
5. Whether the proposed solution breaks existing APIs.
When sending a PR, please do:
1. If a PR contains multiple orthogonal changes, split it into several PRs.
2. If you've added code that should be tested, add tests.
3. For PRs that need experiments (e.g. adding a new model or new methods),
you don't need to update model zoo, but do provide experiment results in the description of the PR.
4. If APIs are changed, update the documentation.
5. Make sure your code lints with `./dev/linter.sh`.
## Contributor License Agreement ("CLA")
In order to accept your pull request, we need you to submit a CLA. You only need
to do this once to work on any of Facebook's open source projects.
Complete your CLA here: <https://code.facebook.com/cla>
## License
By contributing to detectron2, you agree that your contributions will be licensed
under the LICENSE file in the root directory of this source tree.

New file, 6.3 KiB (diff suppressed because one or more lines are too long)

New file (issue template pointer)

@@ -0,0 +1,5 @@
Please select an issue template from
https://github.com/facebookresearch/detectron2/issues/new/choose .
Otherwise your issue will be closed.

New file (bug report issue template)

@@ -0,0 +1,36 @@
---
name: "🐛 Bugs"
about: Report bugs in detectron2
title: Please read & provide the following
---
## Instructions To Reproduce the 🐛 Bug:
1. what changes you made (`git diff`) or what code you wrote
```
<put diff or code here>
```
2. what exact command you run:
3. what you observed (including __full logs__):
```
<put logs here>
```
4. please simplify the steps as much as possible so they do not require additional resources to
run, such as a private dataset.
## Expected behavior:
If there are no obvious errors in "what you observed" provided above,
please tell us the expected behavior.
## Environment:
Provide your environment information using the following command:
```
wget -nc -q https://github.com/facebookresearch/detectron2/raw/master/detectron2/utils/collect_env.py && python collect_env.py
```
If your issue looks like an installation issue / environment issue,
please first try to solve it yourself with the instructions in
https://detectron2.readthedocs.io/tutorials/install.html#common-installation-issues

View File

@ -0,0 +1,9 @@
# require an issue template to be chosen
blank_issues_enabled: false
# Unexpected behaviors & bugs are split into two templates.
# When they are combined into one template, users think "it's not a bug" and don't choose the template.
#
# But the file name is still "unexpected-problems-bugs.md" so that old references
# to this issue template still work.
# It's OK since this template should be a superset of "bugs.md" (unexpected behaviors are a superset of bugs)

View File

@ -0,0 +1,31 @@
---
name: "\U0001F680Feature Request"
about: Submit a proposal/request for a new detectron2 feature
---
## 🚀 Feature
A clear and concise description of the feature proposal.
## Motivation & Examples
Tell us why the feature is useful.
Describe what the feature would look like, if it is implemented.
Best demonstrated using **code examples** in addition to words.
## Note
We only consider adding new features if they are relevant to many users.
If you request the implementation of a research paper, note that
we only consider papers that have enough significance and prevalence in the object detection field.
We do not take requests for most projects in the `projects/` directory,
because they are research code releases that are mainly for other researchers to reproduce results.
Instead of adding features inside detectron2,
you can implement many features by [extending detectron2](https://detectron2.readthedocs.io/tutorials/extend.html).
The [projects/](https://github.com/facebookresearch/detectron2/tree/master/projects/) directory contains many such examples.

View File

@ -0,0 +1,26 @@
---
name: "❓How to do something?"
about: How to do something using detectron2? What does an API do?
---
## ❓ How to do something using detectron2
Describe what you want to do, including:
1. what inputs you will provide, if any:
2. what outputs you are expecting:
## ❓ What does an API do and how to use it?
Please link to which API or documentation you're asking about from
https://detectron2.readthedocs.io/
NOTE:
1. Only general answers are provided.
If you want to ask about "why X did not work", please use the
[Unexpected behaviors](https://github.com/facebookresearch/detectron2/issues/new/choose) issue template.
2. For questions about how to implement new models / new dataloaders / new training logic, etc., check the documentation first.
3. We do not answer general machine learning / computer vision questions that are not specific to detectron2, such as how a model works, how to improve your training / make it converge, or what algorithms/methods can be used to achieve X.

View File

@ -0,0 +1,45 @@
---
name: "Unexpected behaviors"
about: Run into unexpected behaviors when using detectron2
title: Please read & provide the following
---
If you do not know the root cause of the problem and wish someone to help you, please
post according to this template:
## Instructions To Reproduce the Issue:
1. what changes you made (`git diff`) or what code you wrote
```
<put diff or code here>
```
2. what exact command you run:
3. what you observed (including __full logs__):
```
<put logs here>
```
4. please simplify the steps as much as possible so they do not require additional resources to
run, such as a private dataset.
## Expected behavior:
If there is no obvious error in "what you observed" provided above,
please tell us the expected behavior.
If you expect the model to converge / work better, note that we do not give suggestions
on how to train a new model.
We will only help with it in one of the following two conditions:
(1) You're unable to reproduce the results in the detectron2 model zoo.
(2) It indicates a detectron2 bug.
## Environment:
Provide your environment information using the following command:
```
wget -nc -q https://github.com/facebookresearch/detectron2/raw/master/detectron2/utils/collect_env.py && python collect_env.py
```
If your issue looks like an installation issue / environment issue,
please first try to solve it yourself with the instructions in
https://detectron2.readthedocs.io/tutorials/install.html#common-installation-issues

View File

@ -0,0 +1,9 @@
Thanks for your contribution!
If you're sending a large PR (e.g., >50 lines),
please first open an issue about the feature / bug and indicate how you want to contribute.
Before submitting a PR, please run `dev/linter.sh` to lint the code.
See https://detectron2.readthedocs.io/notes/contributing.html#pull-requests
about how we handle PRs.

View File

@ -0,0 +1,46 @@
# output dir
output
instant_test_output
inference_test_output
*.jpg
*.png
*.txt
*.json
*.diff
# compilation and distribution
__pycache__
_ext
*.pyc
*.so
detectron2.egg-info/
build/
dist/
wheels/
# pytorch/python/numpy formats
*.pth
*.pkl
*.npy
# ipython/jupyter notebooks
*.ipynb
**/.ipynb_checkpoints/
# Editor temporaries
*.swn
*.swo
*.swp
*~
# editor settings
.idea
.vscode
# project dirs
/detectron2/model_zoo/configs
/datasets
/projects/*/datasets
/models

View File

@ -0,0 +1,79 @@
## Getting Started with Detectron2
This document provides a brief intro to the usage of the builtin command-line tools in detectron2.
For a tutorial that involves actual coding with the API,
see our [Colab Notebook](https://colab.research.google.com/drive/16jcaJoc6bCFAQ96jDe2HwtXj7BMD_-m5)
which covers how to run inference with an
existing model, and how to train a builtin model on a custom dataset.
For more advanced tutorials, refer to our [documentation](https://detectron2.readthedocs.io/tutorials/extend.html).
### Inference Demo with Pre-trained Models
1. Pick a model and its config file from
[model zoo](MODEL_ZOO.md),
for example, `mask_rcnn_R_50_FPN_3x.yaml`.
2. We provide `demo.py`, which can run builtin standard models. Run it with:
```
cd demo/
python demo.py --config-file ../configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml \
--input input1.jpg input2.jpg \
[--other-options]
--opts MODEL.WEIGHTS detectron2://COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x/137849600/model_final_f10217.pkl
```
The configs are made for training; therefore, we need to point `MODEL.WEIGHTS` to a model from the model zoo for evaluation.
This command will run the inference and show visualizations in an OpenCV window.
For details of the command line arguments, see `demo.py -h` or look at its source code
to understand its behavior. Some common arguments are:
* To run __on your webcam__, replace `--input files` with `--webcam`.
* To run __on a video__, replace `--input files` with `--video-input video.mp4`.
* To run __on cpu__, add `MODEL.DEVICE cpu` after `--opts`.
* To save outputs to a directory (for images) or a file (for webcam or video), use `--output`.
### Training & Evaluation in Command Line
We provide a script, "tools/{,plain_}train_net.py", that can train
all the configs provided in detectron2.
You may want to use it as a reference to write your own training script.
To train a model with "train_net.py", first
setup the corresponding datasets following
[datasets/README.md](./datasets/README.md),
then run:
```
cd tools/
./train_net.py --num-gpus 8 \
--config-file ../configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml
```
The configs are made for 8-GPU training.
To train on 1 GPU, you may need to [change some parameters](https://arxiv.org/abs/1706.02677), e.g.:
```
./train_net.py \
--config-file ../configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml \
--num-gpus 1 SOLVER.IMS_PER_BATCH 2 SOLVER.BASE_LR 0.0025
```
For most models, CPU training is not supported.
To evaluate a model's performance, use
```
./train_net.py \
--config-file ../configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml \
--eval-only MODEL.WEIGHTS /path/to/checkpoint_file
```
For more options, see `./train_net.py -h`.
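If you prefer to write your own training script instead of `train_net.py`, the following is a minimal single-GPU sketch using the `DefaultTrainer` API. It is only an illustration, assuming detectron2 is installed and the COCO dataset is set up as described above; the config name and output directory are placeholders you would adapt.
```
import os

from detectron2 import model_zoo
from detectron2.config import get_cfg
from detectron2.engine import DefaultTrainer

cfg = get_cfg()
cfg.merge_from_file(
    model_zoo.get_config_file("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml")
)
# Optionally initialize from the corresponding model zoo weights.
cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url(
    "COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml"
)
# Single-GPU settings, following the linear scaling rule mentioned above.
cfg.SOLVER.IMS_PER_BATCH = 2
cfg.SOLVER.BASE_LR = 0.0025
cfg.OUTPUT_DIR = "./output"  # placeholder

os.makedirs(cfg.OUTPUT_DIR, exist_ok=True)
trainer = DefaultTrainer(cfg)
trainer.resume_or_load(resume=False)
trainer.train()
```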
### Use Detectron2 APIs in Your Code
See our [Colab Notebook](https://colab.research.google.com/drive/16jcaJoc6bCFAQ96jDe2HwtXj7BMD_-m5)
to learn how to use detectron2 APIs to:
1. run inference with an existing model
2. train a builtin model on a custom dataset
See [detectron2/projects](https://github.com/facebookresearch/detectron2/tree/master/projects)
for more ways to build your project on detectron2.
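As a quick illustration of running inference with an existing model, here is a minimal sketch. It assumes detectron2 and OpenCV are installed; the image path and score threshold are placeholders.
```
import cv2

from detectron2 import model_zoo
from detectron2.config import get_cfg
from detectron2.engine import DefaultPredictor

cfg = get_cfg()
cfg.merge_from_file(
    model_zoo.get_config_file("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml")
)
cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url(
    "COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml"
)
cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.5  # confidence threshold for detections
# cfg.MODEL.DEVICE = "cpu"  # uncomment to run without a GPU

predictor = DefaultPredictor(cfg)
image = cv2.imread("input1.jpg")  # BGR image, as the predictor expects
outputs = predictor(image)
print(outputs["instances"].pred_classes)
print(outputs["instances"].pred_boxes)
```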

View File

@ -0,0 +1,184 @@
## Installation
Our [Colab Notebook](https://colab.research.google.com/drive/16jcaJoc6bCFAQ96jDe2HwtXj7BMD_-m5)
has step-by-step instructions that install detectron2.
The [Dockerfile](docker)
also installs detectron2 with a few simple commands.
### Requirements
- Linux or macOS with Python ≥ 3.6
- PyTorch ≥ 1.4
- [torchvision](https://github.com/pytorch/vision/) that matches the PyTorch installation.
You can install them together at [pytorch.org](https://pytorch.org) to make sure of this.
- OpenCV, optional, needed by demo and visualization
- pycocotools: `pip install cython; pip install -U 'git+https://github.com/cocodataset/cocoapi.git#subdirectory=PythonAPI'`
### Build Detectron2 from Source
gcc & g++ ≥ 5 are required. [ninja](https://ninja-build.org/) is recommended for a faster build.
Once you have them, run:
```
python -m pip install 'git+https://github.com/facebookresearch/detectron2.git'
# (add --user if you don't have permission)
# Or, to install it from a local clone:
git clone https://github.com/facebookresearch/detectron2.git
python -m pip install -e detectron2
# Or if you are on macOS
# CC=clang CXX=clang++ python -m pip install -e .
```
To __rebuild__ detectron2 that's built from a local clone, use `rm -rf build/ **/*.so` to clean the
old build first. You often need to rebuild detectron2 after reinstalling PyTorch.
### Install Pre-Built Detectron2 (Linux only)
```
# for CUDA 10.1:
python -m pip install detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu101/index.html
```
You can replace cu101 with "cu{100,92}" or "cpu".
Note that:
1. Such an installation has to be used with a certain version of the official PyTorch release.
See [releases](https://github.com/facebookresearch/detectron2/releases) for requirements.
It will not work with a different version of PyTorch or a non-official build of PyTorch.
2. Such an installation is out-of-date w.r.t. the master branch of detectron2. It may not be
compatible with the master branch of a research project that uses detectron2 (e.g. those in
[projects](projects) or [meshrcnn](https://github.com/facebookresearch/meshrcnn/)).
### Common Installation Issues
If you meet issues using the pre-built detectron2, please uninstall it and try building it from source.
Click each issue for its solutions:
<details>
<summary>
Undefined torch/aten/caffe2 symbols, or segmentation fault immediately when running the library.
</summary>
<br/>
This usually happens when detectron2 or torchvision is not
compiled with the version of PyTorch you're running.
Pre-built torchvision or detectron2 has to work with the corresponding official release of pytorch.
If the error comes from a pre-built torchvision, uninstall torchvision and pytorch and reinstall them
following [pytorch.org](http://pytorch.org) so that the versions match.
If the error comes from a pre-built detectron2, check [release notes](https://github.com/facebookresearch/detectron2/releases)
to see the corresponding pytorch version required for each pre-built detectron2.
If the error comes from detectron2 or torchvision that you built manually from source,
remove files you built (`build/`, `**/*.so`) and rebuild it so it can pick up the version of pytorch currently in your environment.
If you cannot resolve this problem, please include the output of `gdb -ex "r" -ex "bt" -ex "quit" --args python -m detectron2.utils.collect_env`
in your issue.
</details>
<details>
<summary>
Undefined C++ symbols (e.g. `GLIBCXX`) or C++ symbols not found.
</summary>
<br/>
Usually it's because the library is compiled with a newer C++ compiler but run with an old C++ runtime.
This often happens with old anaconda.
Try `conda update libgcc`. Then rebuild detectron2.
The fundamental solution is to run the code with a proper C++ runtime.
One way is to use `LD_PRELOAD=/path/to/libstdc++.so`.
</details>
<details>
<summary>
"Not compiled with GPU support" or "Detectron2 CUDA Compiler: not available".
</summary>
<br/>
CUDA is not found when building detectron2.
You should make sure
```
python -c 'import torch; from torch.utils.cpp_extension import CUDA_HOME; print(torch.cuda.is_available(), CUDA_HOME)'
```
prints valid output at the time you build detectron2.
Most models can run inference (but not training) without GPU support. To use CPUs, set `MODEL.DEVICE='cpu'` in the config.
</details>
<details>
<summary>
"invalid device function" or "no kernel image is available for execution".
</summary>
<br/>
Two possibilities:
* You built detectron2 with one version of CUDA but are running it with a different version.
To check whether it is the case,
use `python -m detectron2.utils.collect_env` to find out inconsistent CUDA versions.
In the output of this command, you should expect "Detectron2 CUDA Compiler", "CUDA_HOME", "PyTorch built with - CUDA"
to contain cuda libraries of the same version.
When they are inconsistent,
you need to either install a different build of PyTorch (or build by yourself)
to match your local CUDA installation, or install a different version of CUDA to match PyTorch.
* Detectron2 or PyTorch/torchvision is not built for the correct GPU architecture (compute capability).
The GPU architecture for PyTorch/detectron2/torchvision is available in the "architecture flags" in
`python -m detectron2.utils.collect_env`.
The GPU architecture flags of detectron2/torchvision by default matches the GPU model detected
during compilation. This means the compiled code may not work on a different GPU model.
To overwrite the GPU architecture for detectron2/torchvision, use `TORCH_CUDA_ARCH_LIST` environment variable during compilation.
For example, `export TORCH_CUDA_ARCH_LIST=6.0,7.0` makes it compile for both P100s and V100s.
Visit [developer.nvidia.com/cuda-gpus](https://developer.nvidia.com/cuda-gpus) to find out
the correct compute capability number for your device.
</details>
<details>
<summary>
Undefined CUDA symbols; cannot open libcudart.so; other nvcc failures.
</summary>
<br/>
The version of NVCC you use to build detectron2 or torchvision does
not match the version of CUDA you are running with.
This often happens when using anaconda's CUDA runtime.
Use `python -m detectron2.utils.collect_env` to find out inconsistent CUDA versions.
In the output of this command, you should expect "Detectron2 CUDA Compiler", "CUDA_HOME", "PyTorch built with - CUDA"
to contain cuda libraries of the same version.
When they are inconsistent,
you need to either install a different build of PyTorch (or build by yourself)
to match your local CUDA installation, or install a different version of CUDA to match PyTorch.
</details>
<details>
<summary>
"ImportError: cannot import name '_C'".
</summary>
<br/>
Please build and install detectron2 following the instructions above.
If you are running code from detectron2's root directory, `cd` to a different one.
Otherwise you may not be able to import the code that you installed.
</details>
<details>
<summary>
ONNX conversion segfault after some "TraceWarning".
</summary>
<br/>
The ONNX package was compiled with a compiler that is too old.
Please build and install ONNX from its source code using a compiler
whose version is closer to what's used by PyTorch (available in `torch.__config__.show()`).
</details>

View File

@ -0,0 +1,201 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright 2019 - present, Facebook, Inc
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

View File

@ -0,0 +1,903 @@
# Detectron2 Model Zoo and Baselines
## Introduction
This file documents a large collection of baselines trained
with detectron2 in Sep-Oct, 2019.
All numbers were obtained on [Big Basin](https://engineering.fb.com/data-center-engineering/introducing-big-basin-our-next-generation-ai-hardware/)
servers with 8 NVIDIA V100 GPUs & NVLink. The software in use was PyTorch 1.3, CUDA 9.2, cuDNN 7.4.2 or 7.6.3.
You can access these models from code using [detectron2.model_zoo](https://detectron2.readthedocs.io/modules/model_zoo.html) APIs.
In addition to these official baseline models, you can find more models in [projects/](projects/).
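For reference, a short sketch of the `model_zoo` API mentioned above (assuming detectron2 is installed; `model_zoo.get` may require a recent detectron2 release):
```
from detectron2 import model_zoo

cfg_path = "COCO-Detection/faster_rcnn_R_50_FPN_3x.yaml"
print(model_zoo.get_config_file(cfg_path))     # local path of the config file
print(model_zoo.get_checkpoint_url(cfg_path))  # download URL of the trained weights
model = model_zoo.get(cfg_path, trained=True)  # build the model and load the weights
```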
#### How to Read the Tables
* The "Name" column contains a link to the config file. Running `tools/train_net.py` with this config file
and 8 GPUs will reproduce the model.
* Training speed is averaged across the entire training.
We keep updating the speeds with the latest version of detectron2/pytorch/etc.,
so they might differ from the `metrics` file.
Training speed for multi-machine jobs is not provided.
* Inference speed is measured by `tools/train_net.py --eval-only`, or [inference_on_dataset()](https://detectron2.readthedocs.io/modules/evaluation.html#detectron2.evaluation.inference_on_dataset),
with batch size 1 in detectron2 directly.
Measuring it with your own code will likely introduce other overhead.
Actual deployment in production should in general be faster than the given inference
speed due to more optimizations.
* The *model id* column is provided for ease of reference.
To check the integrity of a downloaded file, note that every model on this page contains a prefix of its md5 checksum in its file name (see the sketch after this list).
* Training curves and other statistics can be found in `metrics` for each model.
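A small, hypothetical helper (not part of detectron2) for the integrity check mentioned above, assuming the file-name convention described in that bullet:
```
import hashlib
import re
from pathlib import Path

def verify_md5_prefix(path: str) -> bool:
    """Return True if the file's md5 digest starts with the hex token in its name."""
    name = Path(path).stem                      # e.g. "model_final_f10217"
    match = re.search(r"_([0-9a-f]{6,})$", name)
    if match is None:
        raise ValueError(f"no md5 prefix found in file name: {name}")
    digest = hashlib.md5(Path(path).read_bytes()).hexdigest()
    return digest.startswith(match.group(1))

# Example (path is a placeholder):
# print(verify_md5_prefix("model_final_f10217.pkl"))
```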
#### Common Settings for COCO Models
* All COCO models were trained on `train2017` and evaluated on `val2017`.
* The default settings are __not directly comparable__ with Detectron's standard settings.
For example, our default training data augmentation uses scale jittering in addition to horizontal flipping.
To make fair comparisons with Detectron's settings, see
[Detectron1-Comparisons](configs/Detectron1-Comparisons/) for accuracy comparison,
and [benchmarks](https://detectron2.readthedocs.io/notes/benchmarks.html)
for speed comparison.
* For Faster/Mask R-CNN, we provide baselines based on __3 different backbone combinations__:
* __FPN__: Use a ResNet+FPN backbone with standard conv and FC heads for mask and box prediction,
respectively. It obtains the best
speed/accuracy tradeoff, but the other two are still useful for research.
* __C4__: Use a ResNet conv4 backbone with conv5 head. The original baseline in the Faster R-CNN paper.
* __DC5__ (Dilated-C5): Use a ResNet conv5 backbone with dilations in conv5, and standard conv and FC heads
for mask and box prediction, respectively.
This is used by the Deformable ConvNet paper.
* Most models are trained with the 3x schedule (~37 COCO epochs).
Although 1x models are heavily under-trained, we provide some ResNet-50 models with the 1x (~12 COCO epochs)
training schedule for comparison when doing quick research iteration.
#### ImageNet Pretrained Models
We provide backbone models pretrained on ImageNet-1k dataset.
These models have a __different__ format from those provided in Detectron: we do not fuse BatchNorm into an affine layer.
* [R-50.pkl](https://dl.fbaipublicfiles.com/detectron2/ImageNetPretrained/MSRA/R-50.pkl): converted copy of [MSRA's original ResNet-50](https://github.com/KaimingHe/deep-residual-networks) model.
* [R-101.pkl](https://dl.fbaipublicfiles.com/detectron2/ImageNetPretrained/MSRA/R-101.pkl): converted copy of [MSRA's original ResNet-101](https://github.com/KaimingHe/deep-residual-networks) model.
* [X-101-32x8d.pkl](https://dl.fbaipublicfiles.com/detectron2/ImageNetPretrained/FAIR/X-101-32x8d.pkl): ResNeXt-101-32x8d model trained with Caffe2 at FB.
Pretrained models in Detectron's format can still be used. For example:
* [X-152-32x8d-IN5k.pkl](https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/25093814/X-152-32x8d-IN5k.pkl):
ResNeXt-152-32x8d model trained on ImageNet-5k with Caffe2 at FB (see ResNeXt paper for details on ImageNet-5k).
* [R-50-GN.pkl](https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/47261647/R-50-GN.pkl):
ResNet-50 with Group Normalization.
* [R-101-GN.pkl](https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/47592356/R-101-GN.pkl):
ResNet-101 with Group Normalization.
Torchvision's ResNet models can be used after being converted by [this script](tools/convert-torchvision-to-d2.py).
#### License
All models available for download through this document are licensed under the
[Creative Commons Attribution-ShareAlike 3.0 license](https://creativecommons.org/licenses/by-sa/3.0/).
### COCO Object Detection Baselines
#### Faster R-CNN:
<!--
(fb only) To update the table in vim:
1. Remove the old table: d}
2. Copy the below command to the place of the table
3. :.!bash
./gen_html_table.py --config 'COCO-Detection/faster*50*'{1x,3x}'*' 'COCO-Detection/faster*101*' --name R50-C4 R50-DC5 R50-FPN R50-C4 R50-DC5 R50-FPN R101-C4 R101-DC5 R101-FPN X101-FPN --fields lr_sched train_speed inference_speed mem box_AP
-->
<table><tbody>
<!-- START TABLE -->
<!-- TABLE HEADER -->
<th valign="bottom">Name</th>
<th valign="bottom">lr<br/>sched</th>
<th valign="bottom">train<br/>time<br/>(s/iter)</th>
<th valign="bottom">inference<br/>time<br/>(s/im)</th>
<th valign="bottom">train<br/>mem<br/>(GB)</th>
<th valign="bottom">box<br/>AP</th>
<th valign="bottom">model id</th>
<th valign="bottom">download</th>
<!-- TABLE BODY -->
<!-- ROW: faster_rcnn_R_50_C4_1x -->
<tr><td align="left"><a href="configs/COCO-Detection/faster_rcnn_R_50_C4_1x.yaml">R50-C4</a></td>
<td align="center">1x</td>
<td align="center">0.551</td>
<td align="center">0.102</td>
<td align="center">4.8</td>
<td align="center">35.7</td>
<td align="center">137257644</td>
<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/faster_rcnn_R_50_C4_1x/137257644/model_final_721ade.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/faster_rcnn_R_50_C4_1x/137257644/metrics.json">metrics</a></td>
</tr>
<!-- ROW: faster_rcnn_R_50_DC5_1x -->
<tr><td align="left"><a href="configs/COCO-Detection/faster_rcnn_R_50_DC5_1x.yaml">R50-DC5</a></td>
<td align="center">1x</td>
<td align="center">0.380</td>
<td align="center">0.068</td>
<td align="center">5.0</td>
<td align="center">37.3</td>
<td align="center">137847829</td>
<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/faster_rcnn_R_50_DC5_1x/137847829/model_final_51d356.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/faster_rcnn_R_50_DC5_1x/137847829/metrics.json">metrics</a></td>
</tr>
<!-- ROW: faster_rcnn_R_50_FPN_1x -->
<tr><td align="left"><a href="configs/COCO-Detection/faster_rcnn_R_50_FPN_1x.yaml">R50-FPN</a></td>
<td align="center">1x</td>
<td align="center">0.210</td>
<td align="center">0.038</td>
<td align="center">3.0</td>
<td align="center">37.9</td>
<td align="center">137257794</td>
<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/faster_rcnn_R_50_FPN_1x/137257794/model_final_b275ba.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/faster_rcnn_R_50_FPN_1x/137257794/metrics.json">metrics</a></td>
</tr>
<!-- ROW: faster_rcnn_R_50_C4_3x -->
<tr><td align="left"><a href="configs/COCO-Detection/faster_rcnn_R_50_C4_3x.yaml">R50-C4</a></td>
<td align="center">3x</td>
<td align="center">0.543</td>
<td align="center">0.104</td>
<td align="center">4.8</td>
<td align="center">38.4</td>
<td align="center">137849393</td>
<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/faster_rcnn_R_50_C4_3x/137849393/model_final_f97cb7.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/faster_rcnn_R_50_C4_3x/137849393/metrics.json">metrics</a></td>
</tr>
<!-- ROW: faster_rcnn_R_50_DC5_3x -->
<tr><td align="left"><a href="configs/COCO-Detection/faster_rcnn_R_50_DC5_3x.yaml">R50-DC5</a></td>
<td align="center">3x</td>
<td align="center">0.378</td>
<td align="center">0.070</td>
<td align="center">5.0</td>
<td align="center">39.0</td>
<td align="center">137849425</td>
<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/faster_rcnn_R_50_DC5_3x/137849425/model_final_68d202.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/faster_rcnn_R_50_DC5_3x/137849425/metrics.json">metrics</a></td>
</tr>
<!-- ROW: faster_rcnn_R_50_FPN_3x -->
<tr><td align="left"><a href="configs/COCO-Detection/faster_rcnn_R_50_FPN_3x.yaml">R50-FPN</a></td>
<td align="center">3x</td>
<td align="center">0.209</td>
<td align="center">0.038</td>
<td align="center">3.0</td>
<td align="center">40.2</td>
<td align="center">137849458</td>
<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/faster_rcnn_R_50_FPN_3x/137849458/model_final_280758.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/faster_rcnn_R_50_FPN_3x/137849458/metrics.json">metrics</a></td>
</tr>
<!-- ROW: faster_rcnn_R_101_C4_3x -->
<tr><td align="left"><a href="configs/COCO-Detection/faster_rcnn_R_101_C4_3x.yaml">R101-C4</a></td>
<td align="center">3x</td>
<td align="center">0.619</td>
<td align="center">0.139</td>
<td align="center">5.9</td>
<td align="center">41.1</td>
<td align="center">138204752</td>
<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/faster_rcnn_R_101_C4_3x/138204752/model_final_298dad.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/faster_rcnn_R_101_C4_3x/138204752/metrics.json">metrics</a></td>
</tr>
<!-- ROW: faster_rcnn_R_101_DC5_3x -->
<tr><td align="left"><a href="configs/COCO-Detection/faster_rcnn_R_101_DC5_3x.yaml">R101-DC5</a></td>
<td align="center">3x</td>
<td align="center">0.452</td>
<td align="center">0.086</td>
<td align="center">6.1</td>
<td align="center">40.6</td>
<td align="center">138204841</td>
<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/faster_rcnn_R_101_DC5_3x/138204841/model_final_3e0943.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/faster_rcnn_R_101_DC5_3x/138204841/metrics.json">metrics</a></td>
</tr>
<!-- ROW: faster_rcnn_R_101_FPN_3x -->
<tr><td align="left"><a href="configs/COCO-Detection/faster_rcnn_R_101_FPN_3x.yaml">R101-FPN</a></td>
<td align="center">3x</td>
<td align="center">0.286</td>
<td align="center">0.051</td>
<td align="center">4.1</td>
<td align="center">42.0</td>
<td align="center">137851257</td>
<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/faster_rcnn_R_101_FPN_3x/137851257/model_final_f6e8b1.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/faster_rcnn_R_101_FPN_3x/137851257/metrics.json">metrics</a></td>
</tr>
<!-- ROW: faster_rcnn_X_101_32x8d_FPN_3x -->
<tr><td align="left"><a href="configs/COCO-Detection/faster_rcnn_X_101_32x8d_FPN_3x.yaml">X101-FPN</a></td>
<td align="center">3x</td>
<td align="center">0.638</td>
<td align="center">0.098</td>
<td align="center">6.7</td>
<td align="center">43.0</td>
<td align="center">139173657</td>
<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/faster_rcnn_X_101_32x8d_FPN_3x/139173657/model_final_68b088.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/faster_rcnn_X_101_32x8d_FPN_3x/139173657/metrics.json">metrics</a></td>
</tr>
</tbody></table>
#### RetinaNet:
<!--
./gen_html_table.py --config 'COCO-Detection/retina*50*' 'COCO-Detection/retina*101*' --name R50 R50 R101 --fields lr_sched train_speed inference_speed mem box_AP
-->
<table><tbody>
<!-- START TABLE -->
<!-- TABLE HEADER -->
<th valign="bottom">Name</th>
<th valign="bottom">lr<br/>sched</th>
<th valign="bottom">train<br/>time<br/>(s/iter)</th>
<th valign="bottom">inference<br/>time<br/>(s/im)</th>
<th valign="bottom">train<br/>mem<br/>(GB)</th>
<th valign="bottom">box<br/>AP</th>
<th valign="bottom">model id</th>
<th valign="bottom">download</th>
<!-- TABLE BODY -->
<!-- ROW: retinanet_R_50_FPN_1x -->
<tr><td align="left"><a href="configs/COCO-Detection/retinanet_R_50_FPN_1x.yaml">R50</a></td>
<td align="center">1x</td>
<td align="center">0.200</td>
<td align="center">0.055</td>
<td align="center">3.9</td>
<td align="center">36.5</td>
<td align="center">137593951</td>
<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/retinanet_R_50_FPN_1x/137593951/model_final_b796dc.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/retinanet_R_50_FPN_1x/137593951/metrics.json">metrics</a></td>
</tr>
<!-- ROW: retinanet_R_50_FPN_3x -->
<tr><td align="left"><a href="configs/COCO-Detection/retinanet_R_50_FPN_3x.yaml">R50</a></td>
<td align="center">3x</td>
<td align="center">0.201</td>
<td align="center">0.055</td>
<td align="center">3.9</td>
<td align="center">37.9</td>
<td align="center">137849486</td>
<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/retinanet_R_50_FPN_3x/137849486/model_final_4cafe0.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/retinanet_R_50_FPN_3x/137849486/metrics.json">metrics</a></td>
</tr>
<!-- ROW: retinanet_R_101_FPN_3x -->
<tr><td align="left"><a href="configs/COCO-Detection/retinanet_R_101_FPN_3x.yaml">R101</a></td>
<td align="center">3x</td>
<td align="center">0.280</td>
<td align="center">0.068</td>
<td align="center">5.1</td>
<td align="center">39.9</td>
<td align="center">138363263</td>
<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/retinanet_R_101_FPN_3x/138363263/model_final_59f53c.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/retinanet_R_101_FPN_3x/138363263/metrics.json">metrics</a></td>
</tr>
</tbody></table>
#### RPN & Fast R-CNN:
<!--
./gen_html_table.py --config 'COCO-Detection/rpn*' 'COCO-Detection/fast_rcnn*' --name "RPN R50-C4" "RPN R50-FPN" "Fast R-CNN R50-FPN" --fields lr_sched train_speed inference_speed mem box_AP prop_AR
-->
<table><tbody>
<!-- START TABLE -->
<!-- TABLE HEADER -->
<th valign="bottom">Name</th>
<th valign="bottom">lr<br/>sched</th>
<th valign="bottom">train<br/>time<br/>(s/iter)</th>
<th valign="bottom">inference<br/>time<br/>(s/im)</th>
<th valign="bottom">train<br/>mem<br/>(GB)</th>
<th valign="bottom">box<br/>AP</th>
<th valign="bottom">prop.<br/>AR</th>
<th valign="bottom">model id</th>
<th valign="bottom">download</th>
<!-- TABLE BODY -->
<!-- ROW: rpn_R_50_C4_1x -->
<tr><td align="left"><a href="configs/COCO-Detection/rpn_R_50_C4_1x.yaml">RPN R50-C4</a></td>
<td align="center">1x</td>
<td align="center">0.130</td>
<td align="center">0.034</td>
<td align="center">1.5</td>
<td align="center"></td>
<td align="center">51.6</td>
<td align="center">137258005</td>
<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/rpn_R_50_C4_1x/137258005/model_final_450694.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/rpn_R_50_C4_1x/137258005/metrics.json">metrics</a></td>
</tr>
<!-- ROW: rpn_R_50_FPN_1x -->
<tr><td align="left"><a href="configs/COCO-Detection/rpn_R_50_FPN_1x.yaml">RPN R50-FPN</a></td>
<td align="center">1x</td>
<td align="center">0.186</td>
<td align="center">0.032</td>
<td align="center">2.7</td>
<td align="center"></td>
<td align="center">58.0</td>
<td align="center">137258492</td>
<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/rpn_R_50_FPN_1x/137258492/model_final_02ce48.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/rpn_R_50_FPN_1x/137258492/metrics.json">metrics</a></td>
</tr>
<!-- ROW: fast_rcnn_R_50_FPN_1x -->
<tr><td align="left"><a href="configs/COCO-Detection/fast_rcnn_R_50_FPN_1x.yaml">Fast R-CNN R50-FPN</a></td>
<td align="center">1x</td>
<td align="center">0.140</td>
<td align="center">0.029</td>
<td align="center">2.6</td>
<td align="center">37.8</td>
<td align="center"></td>
<td align="center">137635226</td>
<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/fast_rcnn_R_50_FPN_1x/137635226/model_final_e5f7ce.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/fast_rcnn_R_50_FPN_1x/137635226/metrics.json">metrics</a></td>
</tr>
</tbody></table>
### COCO Instance Segmentation Baselines with Mask R-CNN
<!--
./gen_html_table.py --config 'COCO-InstanceSegmentation/mask*50*'{1x,3x}'*' 'COCO-InstanceSegmentation/mask*101*' --name R50-C4 R50-DC5 R50-FPN R50-C4 R50-DC5 R50-FPN R101-C4 R101-DC5 R101-FPN X101-FPN --fields lr_sched train_speed inference_speed mem box_AP mask_AP
-->
<table><tbody>
<!-- START TABLE -->
<!-- TABLE HEADER -->
<th valign="bottom">Name</th>
<th valign="bottom">lr<br/>sched</th>
<th valign="bottom">train<br/>time<br/>(s/iter)</th>
<th valign="bottom">inference<br/>time<br/>(s/im)</th>
<th valign="bottom">train<br/>mem<br/>(GB)</th>
<th valign="bottom">box<br/>AP</th>
<th valign="bottom">mask<br/>AP</th>
<th valign="bottom">model id</th>
<th valign="bottom">download</th>
<!-- TABLE BODY -->
<!-- ROW: mask_rcnn_R_50_C4_1x -->
<tr><td align="left"><a href="configs/COCO-InstanceSegmentation/mask_rcnn_R_50_C4_1x.yaml">R50-C4</a></td>
<td align="center">1x</td>
<td align="center">0.584</td>
<td align="center">0.110</td>
<td align="center">5.2</td>
<td align="center">36.8</td>
<td align="center">32.2</td>
<td align="center">137259246</td>
<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-InstanceSegmentation/mask_rcnn_R_50_C4_1x/137259246/model_final_9243eb.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-InstanceSegmentation/mask_rcnn_R_50_C4_1x/137259246/metrics.json">metrics</a></td>
</tr>
<!-- ROW: mask_rcnn_R_50_DC5_1x -->
<tr><td align="left"><a href="configs/COCO-InstanceSegmentation/mask_rcnn_R_50_DC5_1x.yaml">R50-DC5</a></td>
<td align="center">1x</td>
<td align="center">0.471</td>
<td align="center">0.076</td>
<td align="center">6.5</td>
<td align="center">38.3</td>
<td align="center">34.2</td>
<td align="center">137260150</td>
<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-InstanceSegmentation/mask_rcnn_R_50_DC5_1x/137260150/model_final_4f86c3.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-InstanceSegmentation/mask_rcnn_R_50_DC5_1x/137260150/metrics.json">metrics</a></td>
</tr>
<!-- ROW: mask_rcnn_R_50_FPN_1x -->
<tr><td align="left"><a href="configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml">R50-FPN</a></td>
<td align="center">1x</td>
<td align="center">0.261</td>
<td align="center">0.043</td>
<td align="center">3.4</td>
<td align="center">38.6</td>
<td align="center">35.2</td>
<td align="center">137260431</td>
<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x/137260431/model_final_a54504.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x/137260431/metrics.json">metrics</a></td>
</tr>
<!-- ROW: mask_rcnn_R_50_C4_3x -->
<tr><td align="left"><a href="configs/COCO-InstanceSegmentation/mask_rcnn_R_50_C4_3x.yaml">R50-C4</a></td>
<td align="center">3x</td>
<td align="center">0.575</td>
<td align="center">0.111</td>
<td align="center">5.2</td>
<td align="center">39.8</td>
<td align="center">34.4</td>
<td align="center">137849525</td>
<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-InstanceSegmentation/mask_rcnn_R_50_C4_3x/137849525/model_final_4ce675.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-InstanceSegmentation/mask_rcnn_R_50_C4_3x/137849525/metrics.json">metrics</a></td>
</tr>
<!-- ROW: mask_rcnn_R_50_DC5_3x -->
<tr><td align="left"><a href="configs/COCO-InstanceSegmentation/mask_rcnn_R_50_DC5_3x.yaml">R50-DC5</a></td>
<td align="center">3x</td>
<td align="center">0.470</td>
<td align="center">0.076</td>
<td align="center">6.5</td>
<td align="center">40.0</td>
<td align="center">35.9</td>
<td align="center">137849551</td>
<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-InstanceSegmentation/mask_rcnn_R_50_DC5_3x/137849551/model_final_84107b.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-InstanceSegmentation/mask_rcnn_R_50_DC5_3x/137849551/metrics.json">metrics</a></td>
</tr>
<!-- ROW: mask_rcnn_R_50_FPN_3x -->
<tr><td align="left"><a href="configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml">R50-FPN</a></td>
<td align="center">3x</td>
<td align="center">0.261</td>
<td align="center">0.043</td>
<td align="center">3.4</td>
<td align="center">41.0</td>
<td align="center">37.2</td>
<td align="center">137849600</td>
<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x/137849600/model_final_f10217.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x/137849600/metrics.json">metrics</a></td>
</tr>
<!-- ROW: mask_rcnn_R_101_C4_3x -->
<tr><td align="left"><a href="configs/COCO-InstanceSegmentation/mask_rcnn_R_101_C4_3x.yaml">R101-C4</a></td>
<td align="center">3x</td>
<td align="center">0.652</td>
<td align="center">0.145</td>
<td align="center">6.3</td>
<td align="center">42.6</td>
<td align="center">36.7</td>
<td align="center">138363239</td>
<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-InstanceSegmentation/mask_rcnn_R_101_C4_3x/138363239/model_final_a2914c.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-InstanceSegmentation/mask_rcnn_R_101_C4_3x/138363239/metrics.json">metrics</a></td>
</tr>
<!-- ROW: mask_rcnn_R_101_DC5_3x -->
<tr><td align="left"><a href="configs/COCO-InstanceSegmentation/mask_rcnn_R_101_DC5_3x.yaml">R101-DC5</a></td>
<td align="center">3x</td>
<td align="center">0.545</td>
<td align="center">0.092</td>
<td align="center">7.6</td>
<td align="center">41.9</td>
<td align="center">37.3</td>
<td align="center">138363294</td>
<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-InstanceSegmentation/mask_rcnn_R_101_DC5_3x/138363294/model_final_0464b7.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-InstanceSegmentation/mask_rcnn_R_101_DC5_3x/138363294/metrics.json">metrics</a></td>
</tr>
<!-- ROW: mask_rcnn_R_101_FPN_3x -->
<tr><td align="left"><a href="configs/COCO-InstanceSegmentation/mask_rcnn_R_101_FPN_3x.yaml">R101-FPN</a></td>
<td align="center">3x</td>
<td align="center">0.340</td>
<td align="center">0.056</td>
<td align="center">4.6</td>
<td align="center">42.9</td>
<td align="center">38.6</td>
<td align="center">138205316</td>
<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-InstanceSegmentation/mask_rcnn_R_101_FPN_3x/138205316/model_final_a3ec72.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-InstanceSegmentation/mask_rcnn_R_101_FPN_3x/138205316/metrics.json">metrics</a></td>
</tr>
<!-- ROW: mask_rcnn_X_101_32x8d_FPN_3x -->
<tr><td align="left"><a href="configs/COCO-InstanceSegmentation/mask_rcnn_X_101_32x8d_FPN_3x.yaml">X101-FPN</a></td>
<td align="center">3x</td>
<td align="center">0.690</td>
<td align="center">0.103</td>
<td align="center">7.2</td>
<td align="center">44.3</td>
<td align="center">39.5</td>
<td align="center">139653917</td>
<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-InstanceSegmentation/mask_rcnn_X_101_32x8d_FPN_3x/139653917/model_final_2d9806.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-InstanceSegmentation/mask_rcnn_X_101_32x8d_FPN_3x/139653917/metrics.json">metrics</a></td>
</tr>
</tbody></table>
### COCO Person Keypoint Detection Baselines with Keypoint R-CNN
<!--
./gen_html_table.py --config 'COCO-Keypoints/*50*' 'COCO-Keypoints/*101*' --name R50-FPN R50-FPN R101-FPN X101-FPN --fields lr_sched train_speed inference_speed mem box_AP keypoint_AP
-->
<table><tbody>
<!-- START TABLE -->
<!-- TABLE HEADER -->
<th valign="bottom">Name</th>
<th valign="bottom">lr<br/>sched</th>
<th valign="bottom">train<br/>time<br/>(s/iter)</th>
<th valign="bottom">inference<br/>time<br/>(s/im)</th>
<th valign="bottom">train<br/>mem<br/>(GB)</th>
<th valign="bottom">box<br/>AP</th>
<th valign="bottom">kp.<br/>AP</th>
<th valign="bottom">model id</th>
<th valign="bottom">download</th>
<!-- TABLE BODY -->
<!-- ROW: keypoint_rcnn_R_50_FPN_1x -->
<tr><td align="left"><a href="configs/COCO-Keypoints/keypoint_rcnn_R_50_FPN_1x.yaml">R50-FPN</a></td>
<td align="center">1x</td>
<td align="center">0.315</td>
<td align="center">0.072</td>
<td align="center">5.0</td>
<td align="center">53.6</td>
<td align="center">64.0</td>
<td align="center">137261548</td>
<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Keypoints/keypoint_rcnn_R_50_FPN_1x/137261548/model_final_04e291.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Keypoints/keypoint_rcnn_R_50_FPN_1x/137261548/metrics.json">metrics</a></td>
</tr>
<!-- ROW: keypoint_rcnn_R_50_FPN_3x -->
<tr><td align="left"><a href="configs/COCO-Keypoints/keypoint_rcnn_R_50_FPN_3x.yaml">R50-FPN</a></td>
<td align="center">3x</td>
<td align="center">0.316</td>
<td align="center">0.066</td>
<td align="center">5.0</td>
<td align="center">55.4</td>
<td align="center">65.5</td>
<td align="center">137849621</td>
<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Keypoints/keypoint_rcnn_R_50_FPN_3x/137849621/model_final_a6e10b.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Keypoints/keypoint_rcnn_R_50_FPN_3x/137849621/metrics.json">metrics</a></td>
</tr>
<!-- ROW: keypoint_rcnn_R_101_FPN_3x -->
<tr><td align="left"><a href="configs/COCO-Keypoints/keypoint_rcnn_R_101_FPN_3x.yaml">R101-FPN</a></td>
<td align="center">3x</td>
<td align="center">0.390</td>
<td align="center">0.076</td>
<td align="center">6.1</td>
<td align="center">56.4</td>
<td align="center">66.1</td>
<td align="center">138363331</td>
<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Keypoints/keypoint_rcnn_R_101_FPN_3x/138363331/model_final_997cc7.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Keypoints/keypoint_rcnn_R_101_FPN_3x/138363331/metrics.json">metrics</a></td>
</tr>
<!-- ROW: keypoint_rcnn_X_101_32x8d_FPN_3x -->
<tr><td align="left"><a href="configs/COCO-Keypoints/keypoint_rcnn_X_101_32x8d_FPN_3x.yaml">X101-FPN</a></td>
<td align="center">3x</td>
<td align="center">0.738</td>
<td align="center">0.121</td>
<td align="center">8.7</td>
<td align="center">57.3</td>
<td align="center">66.0</td>
<td align="center">139686956</td>
<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Keypoints/keypoint_rcnn_X_101_32x8d_FPN_3x/139686956/model_final_5ad38f.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-Keypoints/keypoint_rcnn_X_101_32x8d_FPN_3x/139686956/metrics.json">metrics</a></td>
</tr>
</tbody></table>
### COCO Panoptic Segmentation Baselines with Panoptic FPN
<!--
./gen_html_table.py --config 'COCO-PanopticSegmentation/*50*' 'COCO-PanopticSegmentation/*101*' --name R50-FPN R50-FPN R101-FPN --fields lr_sched train_speed inference_speed mem box_AP mask_AP PQ
-->
<table><tbody>
<!-- START TABLE -->
<!-- TABLE HEADER -->
<th valign="bottom">Name</th>
<th valign="bottom">lr<br/>sched</th>
<th valign="bottom">train<br/>time<br/>(s/iter)</th>
<th valign="bottom">inference<br/>time<br/>(s/im)</th>
<th valign="bottom">train<br/>mem<br/>(GB)</th>
<th valign="bottom">box<br/>AP</th>
<th valign="bottom">mask<br/>AP</th>
<th valign="bottom">PQ</th>
<th valign="bottom">model id</th>
<th valign="bottom">download</th>
<!-- TABLE BODY -->
<!-- ROW: panoptic_fpn_R_50_1x -->
<tr><td align="left"><a href="configs/COCO-PanopticSegmentation/panoptic_fpn_R_50_1x.yaml">R50-FPN</a></td>
<td align="center">1x</td>
<td align="center">0.304</td>
<td align="center">0.053</td>
<td align="center">4.8</td>
<td align="center">37.6</td>
<td align="center">34.7</td>
<td align="center">39.4</td>
<td align="center">139514544</td>
<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-PanopticSegmentation/panoptic_fpn_R_50_1x/139514544/model_final_dbfeb4.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-PanopticSegmentation/panoptic_fpn_R_50_1x/139514544/metrics.json">metrics</a></td>
</tr>
<!-- ROW: panoptic_fpn_R_50_3x -->
<tr><td align="left"><a href="configs/COCO-PanopticSegmentation/panoptic_fpn_R_50_3x.yaml">R50-FPN</a></td>
<td align="center">3x</td>
<td align="center">0.302</td>
<td align="center">0.053</td>
<td align="center">4.8</td>
<td align="center">40.0</td>
<td align="center">36.5</td>
<td align="center">41.5</td>
<td align="center">139514569</td>
<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-PanopticSegmentation/panoptic_fpn_R_50_3x/139514569/model_final_c10459.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-PanopticSegmentation/panoptic_fpn_R_50_3x/139514569/metrics.json">metrics</a></td>
</tr>
<!-- ROW: panoptic_fpn_R_101_3x -->
<tr><td align="left"><a href="configs/COCO-PanopticSegmentation/panoptic_fpn_R_101_3x.yaml">R101-FPN</a></td>
<td align="center">3x</td>
<td align="center">0.392</td>
<td align="center">0.066</td>
<td align="center">6.0</td>
<td align="center">42.4</td>
<td align="center">38.5</td>
<td align="center">43.0</td>
<td align="center">139514519</td>
<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-PanopticSegmentation/panoptic_fpn_R_101_3x/139514519/model_final_cafdb1.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-PanopticSegmentation/panoptic_fpn_R_101_3x/139514519/metrics.json">metrics</a></td>
</tr>
</tbody></table>
### LVIS Instance Segmentation Baselines with Mask R-CNN
Mask R-CNN baselines on the [LVIS dataset](https://lvisdataset.org), v0.5.
These baselines are described in Table 3(c) of the [LVIS paper](https://arxiv.org/abs/1908.03195).
NOTE: the 1x schedule here has the same number of __iterations__ as the COCO 1x baselines.
They are roughly 24 epochs of LVISv0.5 data.
The final results of these configs have large variance across different runs.
<!--
./gen_html_table.py --config 'LVIS-InstanceSegmentation/mask*50*' 'LVIS-InstanceSegmentation/mask*101*' --name R50-FPN R101-FPN X101-FPN --fields lr_sched train_speed inference_speed mem box_AP mask_AP
-->
<table><tbody>
<!-- START TABLE -->
<!-- TABLE HEADER -->
<th valign="bottom">Name</th>
<th valign="bottom">lr<br/>sched</th>
<th valign="bottom">train<br/>time<br/>(s/iter)</th>
<th valign="bottom">inference<br/>time<br/>(s/im)</th>
<th valign="bottom">train<br/>mem<br/>(GB)</th>
<th valign="bottom">box<br/>AP</th>
<th valign="bottom">mask<br/>AP</th>
<th valign="bottom">model id</th>
<th valign="bottom">download</th>
<!-- TABLE BODY -->
<!-- ROW: mask_rcnn_R_50_FPN_1x -->
<tr><td align="left"><a href="configs/LVIS-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml">R50-FPN</a></td>
<td align="center">1x</td>
<td align="center">0.292</td>
<td align="center">0.107</td>
<td align="center">7.1</td>
<td align="center">23.6</td>
<td align="center">24.4</td>
<td align="center">144219072</td>
<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/LVIS-InstanceSegmentation/mask_rcnn_R_50_FPN_1x/144219072/model_final_571f7c.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/LVIS-InstanceSegmentation/mask_rcnn_R_50_FPN_1x/144219072/metrics.json">metrics</a></td>
</tr>
<!-- ROW: mask_rcnn_R_101_FPN_1x -->
<tr><td align="left"><a href="configs/LVIS-InstanceSegmentation/mask_rcnn_R_101_FPN_1x.yaml">R101-FPN</a></td>
<td align="center">1x</td>
<td align="center">0.371</td>
<td align="center">0.114</td>
<td align="center">7.8</td>
<td align="center">25.6</td>
<td align="center">25.9</td>
<td align="center">144219035</td>
<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/LVIS-InstanceSegmentation/mask_rcnn_R_101_FPN_1x/144219035/model_final_824ab5.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/LVIS-InstanceSegmentation/mask_rcnn_R_101_FPN_1x/144219035/metrics.json">metrics</a></td>
</tr>
<!-- ROW: mask_rcnn_X_101_32x8d_FPN_1x -->
<tr><td align="left"><a href="configs/LVIS-InstanceSegmentation/mask_rcnn_X_101_32x8d_FPN_1x.yaml">X101-FPN</a></td>
<td align="center">1x</td>
<td align="center">0.712</td>
<td align="center">0.151</td>
<td align="center">10.2</td>
<td align="center">26.7</td>
<td align="center">27.1</td>
<td align="center">144219108</td>
<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/LVIS-InstanceSegmentation/mask_rcnn_X_101_32x8d_FPN_1x/144219108/model_final_5e3439.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/LVIS-InstanceSegmentation/mask_rcnn_X_101_32x8d_FPN_1x/144219108/metrics.json">metrics</a></td>
</tr>
</tbody></table>
### Cityscapes & Pascal VOC Baselines
Simple baselines for
* Mask R-CNN on Cityscapes instance segmentation (initialized from COCO pre-training, then trained on Cityscapes fine annotations only)
* Faster R-CNN on PASCAL VOC object detection (trained on VOC 2007 train+val + VOC 2012 train+val, tested on VOC 2007 using 11-point interpolated AP)
<!--
./gen_html_table.py --config 'Cityscapes/*' 'PascalVOC-Detection/*' --name "R50-FPN, Cityscapes" "R50-C4, VOC" --fields train_speed inference_speed mem box_AP box_AP50 mask_AP
-->
<table><tbody>
<!-- START TABLE -->
<!-- TABLE HEADER -->
<th valign="bottom">Name</th>
<th valign="bottom">train<br/>time<br/>(s/iter)</th>
<th valign="bottom">inference<br/>time<br/>(s/im)</th>
<th valign="bottom">train<br/>mem<br/>(GB)</th>
<th valign="bottom">box<br/>AP</th>
<th valign="bottom">box<br/>AP50</th>
<th valign="bottom">mask<br/>AP</th>
<th valign="bottom">model id</th>
<th valign="bottom">download</th>
<!-- TABLE BODY -->
<!-- ROW: mask_rcnn_R_50_FPN -->
<tr><td align="left"><a href="configs/Cityscapes/mask_rcnn_R_50_FPN.yaml">R50-FPN, Cityscapes</a></td>
<td align="center">0.240</td>
<td align="center">0.078</td>
<td align="center">4.4</td>
<td align="center"></td>
<td align="center"></td>
<td align="center">36.5</td>
<td align="center">142423278</td>
<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/Cityscapes/mask_rcnn_R_50_FPN/142423278/model_final_af9cf5.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/Cityscapes/mask_rcnn_R_50_FPN/142423278/metrics.json">metrics</a></td>
</tr>
<!-- ROW: faster_rcnn_R_50_C4 -->
<tr><td align="left"><a href="configs/PascalVOC-Detection/faster_rcnn_R_50_C4.yaml">R50-C4, VOC</a></td>
<td align="center">0.537</td>
<td align="center">0.081</td>
<td align="center">4.8</td>
<td align="center">51.9</td>
<td align="center">80.3</td>
<td align="center"></td>
<td align="center">142202221</td>
<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/PascalVOC-Detection/faster_rcnn_R_50_C4/142202221/model_final_b1acc2.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/PascalVOC-Detection/faster_rcnn_R_50_C4/142202221/metrics.json">metrics</a></td>
</tr>
</tbody></table>
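The VOC rows above are evaluated with the 11-point interpolated AP from the original PASCAL VOC protocol rather than COCO-style AP: the average, over recall thresholds 0.0, 0.1, ..., 1.0, of the best precision achieved at or beyond each threshold. A minimal sketch of that metric (the function name and the toy precision/recall values are illustrative only):

```python
import numpy as np

def voc_11_point_ap(recall, precision):
    """11-point interpolated AP: mean over t in {0.0, 0.1, ..., 1.0} of the
    maximum precision at any recall >= t (0 if that recall is never reached)."""
    recall = np.asarray(recall)
    precision = np.asarray(precision)
    ap = 0.0
    for t in np.arange(0.0, 1.1, 0.1):
        mask = recall >= t
        p = precision[mask].max() if mask.any() else 0.0
        ap += p / 11.0
    return ap

# Toy precision/recall curve, for illustration only.
print(voc_11_point_ap([0.1, 0.4, 0.7, 0.9], [1.0, 0.8, 0.6, 0.4]))
```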
### Other Settings
Ablations for Deformable Conv and Cascade R-CNN:
<!--
./gen_html_table.py --config 'COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml' 'Misc/*R_50_FPN_1x_dconv*' 'Misc/cascade*1x.yaml' 'COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml' 'Misc/*R_50_FPN_3x_dconv*' 'Misc/cascade*3x.yaml' --name "Baseline R50-FPN" "Deformable Conv" "Cascade R-CNN" "Baseline R50-FPN" "Deformable Conv" "Cascade R-CNN" --fields lr_sched train_speed inference_speed mem box_AP mask_AP
-->
<table><tbody>
<!-- START TABLE -->
<!-- TABLE HEADER -->
<th valign="bottom">Name</th>
<th valign="bottom">lr<br/>sched</th>
<th valign="bottom">train<br/>time<br/>(s/iter)</th>
<th valign="bottom">inference<br/>time<br/>(s/im)</th>
<th valign="bottom">train<br/>mem<br/>(GB)</th>
<th valign="bottom">box<br/>AP</th>
<th valign="bottom">mask<br/>AP</th>
<th valign="bottom">model id</th>
<th valign="bottom">download</th>
<!-- TABLE BODY -->
<!-- ROW: mask_rcnn_R_50_FPN_1x -->
<tr><td align="left"><a href="configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml">Baseline R50-FPN</a></td>
<td align="center">1x</td>
<td align="center">0.261</td>
<td align="center">0.043</td>
<td align="center">3.4</td>
<td align="center">38.6</td>
<td align="center">35.2</td>
<td align="center">137260431</td>
<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x/137260431/model_final_a54504.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x/137260431/metrics.json">metrics</a></td>
</tr>
<!-- ROW: mask_rcnn_R_50_FPN_1x_dconv_c3-c5 -->
<tr><td align="left"><a href="configs/Misc/mask_rcnn_R_50_FPN_1x_dconv_c3-c5.yaml">Deformable Conv</a></td>
<td align="center">1x</td>
<td align="center">0.342</td>
<td align="center">0.048</td>
<td align="center">3.5</td>
<td align="center">41.5</td>
<td align="center">37.5</td>
<td align="center">138602867</td>
<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/Misc/mask_rcnn_R_50_FPN_1x_dconv_c3-c5/138602867/model_final_65c703.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/Misc/mask_rcnn_R_50_FPN_1x_dconv_c3-c5/138602867/metrics.json">metrics</a></td>
</tr>
<!-- ROW: cascade_mask_rcnn_R_50_FPN_1x -->
<tr><td align="left"><a href="configs/Misc/cascade_mask_rcnn_R_50_FPN_1x.yaml">Cascade R-CNN</a></td>
<td align="center">1x</td>
<td align="center">0.317</td>
<td align="center">0.052</td>
<td align="center">4.0</td>
<td align="center">42.1</td>
<td align="center">36.4</td>
<td align="center">138602847</td>
<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/Misc/cascade_mask_rcnn_R_50_FPN_1x/138602847/model_final_e9d89b.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/Misc/cascade_mask_rcnn_R_50_FPN_1x/138602847/metrics.json">metrics</a></td>
</tr>
<!-- ROW: mask_rcnn_R_50_FPN_3x -->
<tr><td align="left"><a href="configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml">Baseline R50-FPN</a></td>
<td align="center">3x</td>
<td align="center">0.261</td>
<td align="center">0.043</td>
<td align="center">3.4</td>
<td align="center">41.0</td>
<td align="center">37.2</td>
<td align="center">137849600</td>
<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x/137849600/model_final_f10217.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x/137849600/metrics.json">metrics</a></td>
</tr>
<!-- ROW: mask_rcnn_R_50_FPN_3x_dconv_c3-c5 -->
<tr><td align="left"><a href="configs/Misc/mask_rcnn_R_50_FPN_3x_dconv_c3-c5.yaml">Deformable Conv</a></td>
<td align="center">3x</td>
<td align="center">0.349</td>
<td align="center">0.047</td>
<td align="center">3.5</td>
<td align="center">42.7</td>
<td align="center">38.5</td>
<td align="center">144998336</td>
<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/Misc/mask_rcnn_R_50_FPN_3x_dconv_c3-c5/144998336/model_final_821d0b.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/Misc/mask_rcnn_R_50_FPN_3x_dconv_c3-c5/144998336/metrics.json">metrics</a></td>
</tr>
<!-- ROW: cascade_mask_rcnn_R_50_FPN_3x -->
<tr><td align="left"><a href="configs/Misc/cascade_mask_rcnn_R_50_FPN_3x.yaml">Cascade R-CNN</a></td>
<td align="center">3x</td>
<td align="center">0.328</td>
<td align="center">0.053</td>
<td align="center">4.0</td>
<td align="center">44.3</td>
<td align="center">38.5</td>
<td align="center">144998488</td>
<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/Misc/cascade_mask_rcnn_R_50_FPN_3x/144998488/model_final_480dd8.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/Misc/cascade_mask_rcnn_R_50_FPN_3x/144998488/metrics.json">metrics</a></td>
</tr>
</tbody></table>
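The "Deformable Conv" rows above replace the standard 3x3 convolutions in res3-res5 with deformable convolutions (`DEFORM_ON_PER_STAGE: [False, True, True, True]` in the Misc configs listed later in this commit). A minimal sketch of the underlying operation using `torchvision.ops.DeformConv2d`; the channel counts and tensor sizes here are arbitrary illustrations, not detectron2's:

```python
import torch
from torchvision.ops import DeformConv2d

# A 3x3 deformable conv needs 2 offsets (dx, dy) per kernel position.
offset_pred = torch.nn.Conv2d(64, 2 * 3 * 3, kernel_size=3, padding=1)
deform_conv = DeformConv2d(64, 64, kernel_size=3, padding=1)

x = torch.randn(1, 64, 32, 32)
offsets = offset_pred(x)      # per-location sampling offsets, predicted from the input
y = deform_conv(x, offsets)   # convolution samples the input at the offset locations
print(y.shape)                # torch.Size([1, 64, 32, 32])
```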
Ablations for normalization methods, plus a few models trained from scratch following [Rethinking ImageNet Pre-training](https://arxiv.org/abs/1811.08883).
(Note: the baseline uses a `2fc` box head while the others use a [`4conv1fc` head](https://arxiv.org/abs/1803.08494).)
<!--
./gen_html_table.py --config 'COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml' 'Misc/mask*50_FPN_3x_gn.yaml' 'Misc/mask*50_FPN_3x_syncbn.yaml' 'Misc/scratch*' --name "Baseline R50-FPN" "GN" "SyncBN" "GN (from scratch)" "GN (from scratch)" "SyncBN (from scratch)" --fields lr_sched train_speed inference_speed mem box_AP mask_AP
-->
<table><tbody>
<!-- START TABLE -->
<!-- TABLE HEADER -->
<th valign="bottom">Name</th>
<th valign="bottom">lr<br/>sched</th>
<th valign="bottom">train<br/>time<br/>(s/iter)</th>
<th valign="bottom">inference<br/>time<br/>(s/im)</th>
<th valign="bottom">train<br/>mem<br/>(GB)</th>
<th valign="bottom">box<br/>AP</th>
<th valign="bottom">mask<br/>AP</th>
<th valign="bottom">model id</th>
<th valign="bottom">download</th>
<!-- TABLE BODY -->
<!-- ROW: mask_rcnn_R_50_FPN_3x -->
<tr><td align="left"><a href="configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml">Baseline R50-FPN</a></td>
<td align="center">3x</td>
<td align="center">0.261</td>
<td align="center">0.043</td>
<td align="center">3.4</td>
<td align="center">41.0</td>
<td align="center">37.2</td>
<td align="center">137849600</td>
<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x/137849600/model_final_f10217.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x/137849600/metrics.json">metrics</a></td>
</tr>
<!-- ROW: mask_rcnn_R_50_FPN_3x_gn -->
<tr><td align="left"><a href="configs/Misc/mask_rcnn_R_50_FPN_3x_gn.yaml">GN</a></td>
<td align="center">3x</td>
<td align="center">0.356</td>
<td align="center">0.069</td>
<td align="center">7.3</td>
<td align="center">42.6</td>
<td align="center">38.6</td>
<td align="center">138602888</td>
<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/Misc/mask_rcnn_R_50_FPN_3x_gn/138602888/model_final_dc5d9e.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/Misc/mask_rcnn_R_50_FPN_3x_gn/138602888/metrics.json">metrics</a></td>
</tr>
<!-- ROW: mask_rcnn_R_50_FPN_3x_syncbn -->
<tr><td align="left"><a href="configs/Misc/mask_rcnn_R_50_FPN_3x_syncbn.yaml">SyncBN</a></td>
<td align="center">3x</td>
<td align="center">0.371</td>
<td align="center">0.053</td>
<td align="center">5.5</td>
<td align="center">41.9</td>
<td align="center">37.8</td>
<td align="center">169527823</td>
<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/Misc/mask_rcnn_R_50_FPN_3x_syncbn/169527823/model_final_3b3c51.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/Misc/mask_rcnn_R_50_FPN_3x_syncbn/169527823/metrics.json">metrics</a></td>
</tr>
<!-- ROW: scratch_mask_rcnn_R_50_FPN_3x_gn -->
<tr><td align="left"><a href="configs/Misc/scratch_mask_rcnn_R_50_FPN_3x_gn.yaml">GN (from scratch)</a></td>
<td align="center">3x</td>
<td align="center">0.400</td>
<td align="center">0.069</td>
<td align="center">9.8</td>
<td align="center">39.9</td>
<td align="center">36.6</td>
<td align="center">138602908</td>
<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/Misc/scratch_mask_rcnn_R_50_FPN_3x_gn/138602908/model_final_01ca85.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/Misc/scratch_mask_rcnn_R_50_FPN_3x_gn/138602908/metrics.json">metrics</a></td>
</tr>
<!-- ROW: scratch_mask_rcnn_R_50_FPN_9x_gn -->
<tr><td align="left"><a href="configs/Misc/scratch_mask_rcnn_R_50_FPN_9x_gn.yaml">GN (from scratch)</a></td>
<td align="center">9x</td>
<td align="center">N/A</td>
<td align="center">0.070</td>
<td align="center">9.8</td>
<td align="center">43.7</td>
<td align="center">39.6</td>
<td align="center">183808979</td>
<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/Misc/scratch_mask_rcnn_R_50_FPN_9x_gn/183808979/model_final_da7b4c.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/Misc/scratch_mask_rcnn_R_50_FPN_9x_gn/183808979/metrics.json">metrics</a></td>
</tr>
<!-- ROW: scratch_mask_rcnn_R_50_FPN_9x_syncbn -->
<tr><td align="left"><a href="configs/Misc/scratch_mask_rcnn_R_50_FPN_9x_syncbn.yaml">SyncBN (from scratch)</a></td>
<td align="center">9x</td>
<td align="center">N/A</td>
<td align="center">0.055</td>
<td align="center">7.2</td>
<td align="center">43.6</td>
<td align="center">39.3</td>
<td align="center">184226666</td>
<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/Misc/scratch_mask_rcnn_R_50_FPN_9x_syncbn/184226666/model_final_5ce33e.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/Misc/scratch_mask_rcnn_R_50_FPN_9x_syncbn/184226666/metrics.json">metrics</a></td>
</tr>
</tbody></table>
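The GN and SyncBN rows above differ from the baseline mainly in which normalization layer is used in the backbone and the `4conv1fc` box head. A minimal sketch of the two normalizers in plain PyTorch (the channel count is chosen only for illustration):

```python
import torch
import torch.nn as nn
import torch.distributed as dist

x = torch.randn(2, 256, 14, 14)

# GroupNorm: statistics over groups of channels within each sample,
# independent of batch size (detectron2's GN heads use 32 groups).
gn = nn.GroupNorm(num_groups=32, num_channels=256)

# SyncBatchNorm shares batch statistics across GPUs during distributed
# training; on a single, non-distributed process it behaves like BatchNorm2d.
norm = nn.SyncBatchNorm(256) if dist.is_available() and dist.is_initialized() else nn.BatchNorm2d(256)

print(gn(x).shape, norm(x).shape)
```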
A few very large models, trained for a long time for demo purposes; they are trained using multiple machines:
<!--
./gen_html_table.py --config 'Misc/panoptic_*dconv*' 'Misc/cascade_*152*' --name "Panoptic FPN R101" "Mask R-CNN X152" --fields inference_speed mem box_AP mask_AP PQ
# manually add TTA results
-->
<table><tbody>
<!-- START TABLE -->
<!-- TABLE HEADER -->
<th valign="bottom">Name</th>
<th valign="bottom">inference<br/>time<br/>(s/im)</th>
<th valign="bottom">train<br/>mem<br/>(GB)</th>
<th valign="bottom">box<br/>AP</th>
<th valign="bottom">mask<br/>AP</th>
<th valign="bottom">PQ</th>
<th valign="bottom">model id</th>
<th valign="bottom">download</th>
<!-- TABLE BODY -->
<!-- ROW: panoptic_fpn_R_101_dconv_cascade_gn_3x -->
<tr><td align="left"><a href="configs/Misc/panoptic_fpn_R_101_dconv_cascade_gn_3x.yaml">Panoptic FPN R101</a></td>
<td align="center">0.107</td>
<td align="center">11.4</td>
<td align="center">47.4</td>
<td align="center">41.3</td>
<td align="center">46.1</td>
<td align="center">139797668</td>
<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/Misc/panoptic_fpn_R_101_dconv_cascade_gn_3x/139797668/model_final_be35db.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/Misc/panoptic_fpn_R_101_dconv_cascade_gn_3x/139797668/metrics.json">metrics</a></td>
</tr>
<!-- ROW: cascade_mask_rcnn_X_152_32x8d_FPN_IN5k_gn_dconv -->
<tr><td align="left"><a href="configs/Misc/cascade_mask_rcnn_X_152_32x8d_FPN_IN5k_gn_dconv.yaml">Mask R-CNN X152</a></td>
<td align="center">0.242</td>
<td align="center">15.1</td>
<td align="center">50.2</td>
<td align="center">44.0</td>
<td align="center"></td>
<td align="center">18131413</td>
<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/Misc/cascade_mask_rcnn_X_152_32x8d_FPN_IN5k_gn_dconv/18131413/model_0039999_e76410.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/Misc/cascade_mask_rcnn_X_152_32x8d_FPN_IN5k_gn_dconv/18131413/metrics.json">metrics</a></td>
</tr>
<!-- ROW: TTA cascade_mask_rcnn_X_152_32x8d_FPN_IN5k_gn_dconv -->
<tr><td align="left">above + test-time aug.</td>
<td align="center"></td>
<td align="center"></td>
<td align="center">51.9</td>
<td align="center">45.9</td>
<td align="center"></td>
<td align="center"></td>
<td align="center"></td>
</tr>
</tbody></table>
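The "above + test-time aug." row reports the same X152 checkpoint evaluated with multi-scale and horizontal-flip test-time augmentation. A rough sketch of how this is typically switched on in detectron2, assuming the library is installed; the scale list shown is only an example, check the released config for the exact evaluation settings:

```python
from detectron2.config import get_cfg
from detectron2.modeling import GeneralizedRCNNWithTTA, build_model

cfg = get_cfg()
cfg.merge_from_file("configs/Misc/cascade_mask_rcnn_X_152_32x8d_FPN_IN5k_gn_dconv.yaml")
cfg.TEST.AUG.ENABLED = True       # multi-scale + flip inference
cfg.TEST.AUG.MIN_SIZES = (400, 500, 600, 700, 800, 900, 1000, 1100, 1200)
cfg.TEST.AUG.FLIP = True

model = build_model(cfg)
tta_model = GeneralizedRCNNWithTTA(cfg, model)  # merges predictions across the augmented views
```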

View File

@ -0,0 +1,56 @@
<img src=".github/Detectron2-Logo-Horz.svg" width="300" >
Detectron2 is Facebook AI Research's next generation software system
that implements state-of-the-art object detection algorithms.
It is a ground-up rewrite of the previous version,
[Detectron](https://github.com/facebookresearch/Detectron/),
and it originates from [maskrcnn-benchmark](https://github.com/facebookresearch/maskrcnn-benchmark/).
<div align="center">
<img src="https://user-images.githubusercontent.com/1381301/66535560-d3422200-eace-11e9-9123-5535d469db19.png"/>
</div>
### What's New
* It is powered by the [PyTorch](https://pytorch.org) deep learning framework.
* Includes more features such as panoptic segmentation, densepose, Cascade R-CNN, rotated bounding boxes, etc.
* Can be used as a library to support [different projects](projects/) on top of it.
We'll open source more research projects in this way.
* It [trains much faster](https://detectron2.readthedocs.io/notes/benchmarks.html).
See our [blog post](https://ai.facebook.com/blog/-detectron2-a-pytorch-based-modular-object-detection-library-/)
for more demos and to learn more about detectron2.
## Installation
See [INSTALL.md](INSTALL.md).
## Quick Start
See [GETTING_STARTED.md](GETTING_STARTED.md),
or the [Colab Notebook](https://colab.research.google.com/drive/16jcaJoc6bCFAQ96jDe2HwtXj7BMD_-m5).
Learn more at our [documentation](https://detectron2.readthedocs.org).
And see [projects/](projects/) for some projects that are built on top of detectron2.
## Model Zoo and Baselines
We provide a large set of baseline results and trained models available for download in the [Detectron2 Model Zoo](MODEL_ZOO.md).
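For orientation, a minimal sketch of pulling one of these baselines through detectron2's model zoo API and running inference (the input image path is a placeholder):

```python
import cv2
from detectron2 import model_zoo
from detectron2.config import get_cfg
from detectron2.engine import DefaultPredictor

cfg = get_cfg()
cfg.merge_from_file(model_zoo.get_config_file("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml"))
cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml")
cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.5  # confidence threshold for reported detections

predictor = DefaultPredictor(cfg)
outputs = predictor(cv2.imread("input.jpg"))  # placeholder image path
print(outputs["instances"].pred_classes, outputs["instances"].pred_boxes)
```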
## License
Detectron2 is released under the [Apache 2.0 license](LICENSE).
## Citing Detectron2
If you use Detectron2 in your research or wish to refer to the baseline results published in the [Model Zoo](MODEL_ZOO.md), please use the following BibTeX entry.
```BibTeX
@misc{wu2019detectron2,
author = {Yuxin Wu and Alexander Kirillov and Francisco Massa and
Wan-Yen Lo and Ross Girshick},
title = {Detectron2},
howpublished = {\url{https://github.com/facebookresearch/detectron2}},
year = {2019}
}
```

View File

@ -0,0 +1,18 @@
MODEL:
  META_ARCHITECTURE: "GeneralizedRCNN"
  RPN:
    PRE_NMS_TOPK_TEST: 6000
    POST_NMS_TOPK_TEST: 1000
  ROI_HEADS:
    NAME: "Res5ROIHeads"
DATASETS:
  TRAIN: ("coco_2017_train",)
  TEST: ("coco_2017_val",)
SOLVER:
  IMS_PER_BATCH: 16
  BASE_LR: 0.02
  STEPS: (60000, 80000)
  MAX_ITER: 90000
INPUT:
  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
VERSION: 2

View File

@ -0,0 +1,31 @@
MODEL:
  META_ARCHITECTURE: "GeneralizedRCNN"
  RESNETS:
    OUT_FEATURES: ["res5"]
    RES5_DILATION: 2
  RPN:
    IN_FEATURES: ["res5"]
    PRE_NMS_TOPK_TEST: 6000
    POST_NMS_TOPK_TEST: 1000
  ROI_HEADS:
    NAME: "StandardROIHeads"
    IN_FEATURES: ["res5"]
  ROI_BOX_HEAD:
    NAME: "FastRCNNConvFCHead"
    NUM_FC: 2
    POOLER_RESOLUTION: 7
  ROI_MASK_HEAD:
    NAME: "MaskRCNNConvUpsampleHead"
    NUM_CONV: 4
    POOLER_RESOLUTION: 14
DATASETS:
  TRAIN: ("coco_2017_train",)
  TEST: ("coco_2017_val",)
SOLVER:
  IMS_PER_BATCH: 16
  BASE_LR: 0.02
  STEPS: (60000, 80000)
  MAX_ITER: 90000
INPUT:
  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
VERSION: 2

View File

@ -0,0 +1,42 @@
MODEL:
  META_ARCHITECTURE: "GeneralizedRCNN"
  BACKBONE:
    NAME: "build_resnet_fpn_backbone"
  RESNETS:
    OUT_FEATURES: ["res2", "res3", "res4", "res5"]
  FPN:
    IN_FEATURES: ["res2", "res3", "res4", "res5"]
  ANCHOR_GENERATOR:
    SIZES: [[32], [64], [128], [256], [512]] # One size for each in feature map
    ASPECT_RATIOS: [[0.5, 1.0, 2.0]] # Three aspect ratios (same for all in feature maps)
  RPN:
    IN_FEATURES: ["p2", "p3", "p4", "p5", "p6"]
    PRE_NMS_TOPK_TRAIN: 2000 # Per FPN level
    PRE_NMS_TOPK_TEST: 1000 # Per FPN level
    # Detectron1 uses 2000 proposals per-batch,
    # (See "modeling/rpn/rpn_outputs.py" for details of this legacy issue)
    # which is approximately 1000 proposals per-image since the default batch size for FPN is 2.
    POST_NMS_TOPK_TRAIN: 1000
    POST_NMS_TOPK_TEST: 1000
  ROI_HEADS:
    NAME: "StandardROIHeads"
    IN_FEATURES: ["p2", "p3", "p4", "p5"]
  ROI_BOX_HEAD:
    NAME: "FastRCNNConvFCHead"
    NUM_FC: 2
    POOLER_RESOLUTION: 7
  ROI_MASK_HEAD:
    NAME: "MaskRCNNConvUpsampleHead"
    NUM_CONV: 4
    POOLER_RESOLUTION: 14
DATASETS:
  TRAIN: ("coco_2017_train",)
  TEST: ("coco_2017_val",)
SOLVER:
  IMS_PER_BATCH: 16
  BASE_LR: 0.02
  STEPS: (60000, 80000)
  MAX_ITER: 90000
INPUT:
  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
VERSION: 2
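The per-model configs later in this commit inherit from these `Base-*.yaml` files through the `_BASE_` key and only state what differs (weights, depth, schedule, ...). A small sketch of how such a config is resolved and further overridden in Python; the override values and config path are examples only:

```python
from detectron2.config import get_cfg

cfg = get_cfg()  # detectron2 defaults
# merge_from_file follows the _BASE_ chain, so a derived yaml only needs the deltas.
cfg.merge_from_file("configs/COCO-Detection/faster_rcnn_R_50_FPN_1x.yaml")
cfg.merge_from_list(["SOLVER.IMS_PER_BATCH", 8, "SOLVER.BASE_LR", 0.01])  # example overrides
print(cfg.MODEL.RESNETS.DEPTH, cfg.SOLVER.MAX_ITER)
```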

View File

@ -0,0 +1,24 @@
MODEL:
  META_ARCHITECTURE: "RetinaNet"
  BACKBONE:
    NAME: "build_retinanet_resnet_fpn_backbone"
  RESNETS:
    OUT_FEATURES: ["res3", "res4", "res5"]
  ANCHOR_GENERATOR:
    SIZES: !!python/object/apply:eval ["[[x, x * 2**(1.0/3), x * 2**(2.0/3) ] for x in [32, 64, 128, 256, 512 ]]"]
  FPN:
    IN_FEATURES: ["res3", "res4", "res5"]
  RETINANET:
    IOU_THRESHOLDS: [0.4, 0.5]
    IOU_LABELS: [0, -1, 1]
DATASETS:
  TRAIN: ("coco_2017_train",)
  TEST: ("coco_2017_val",)
SOLVER:
  IMS_PER_BATCH: 16
  BASE_LR: 0.01 # Note that RetinaNet uses a different default learning rate
  STEPS: (60000, 80000)
  MAX_ITER: 90000
INPUT:
  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
VERSION: 2
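The `!!python/object/apply:eval` line above generates three anchor scales per FPN level. A quick check of what it expands to:

```python
sizes = [[x, x * 2 ** (1.0 / 3), x * 2 ** (2.0 / 3)] for x in [32, 64, 128, 256, 512]]
for per_level in sizes:
    print([round(s, 1) for s in per_level])
# [32, 40.3, 50.8], [64, 80.6, 101.6], ..., [512, 645.1, 812.7]
```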

View File

@ -0,0 +1,17 @@
_BASE_: "../Base-RCNN-FPN.yaml"
MODEL:
  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
  MASK_ON: False
  LOAD_PROPOSALS: True
  RESNETS:
    DEPTH: 50
  PROPOSAL_GENERATOR:
    NAME: "PrecomputedProposals"
DATASETS:
  TRAIN: ("coco_2017_train",)
  PROPOSAL_FILES_TRAIN: ("detectron2://COCO-Detection/rpn_R_50_FPN_1x/137258492/coco_2017_train_box_proposals_21bc3a.pkl", )
  TEST: ("coco_2017_val",)
  PROPOSAL_FILES_TEST: ("detectron2://COCO-Detection/rpn_R_50_FPN_1x/137258492/coco_2017_val_box_proposals_ee0dad.pkl", )
DATALOADER:
  # proposals are part of the dataset_dicts, and take a lot of RAM
  NUM_WORKERS: 2

View File

@ -0,0 +1,9 @@
_BASE_: "../Base-RCNN-C4.yaml"
MODEL:
  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
  MASK_ON: False
  RESNETS:
    DEPTH: 101
SOLVER:
  STEPS: (210000, 250000)
  MAX_ITER: 270000

View File

@ -0,0 +1,9 @@
_BASE_: "../Base-RCNN-DilatedC5.yaml"
MODEL:
  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
  MASK_ON: False
  RESNETS:
    DEPTH: 101
SOLVER:
  STEPS: (210000, 250000)
  MAX_ITER: 270000

View File

@ -0,0 +1,9 @@
_BASE_: "../Base-RCNN-FPN.yaml"
MODEL:
  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
  MASK_ON: False
  RESNETS:
    DEPTH: 101
SOLVER:
  STEPS: (210000, 250000)
  MAX_ITER: 270000

View File

@ -0,0 +1,6 @@
_BASE_: "../Base-RCNN-C4.yaml"
MODEL:
  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
  MASK_ON: False
  RESNETS:
    DEPTH: 50

View File

@ -0,0 +1,9 @@
_BASE_: "../Base-RCNN-C4.yaml"
MODEL:
  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
  MASK_ON: False
  RESNETS:
    DEPTH: 50
SOLVER:
  STEPS: (210000, 250000)
  MAX_ITER: 270000

View File

@ -0,0 +1,6 @@
_BASE_: "../Base-RCNN-DilatedC5.yaml"
MODEL:
  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
  MASK_ON: False
  RESNETS:
    DEPTH: 50

View File

@ -0,0 +1,9 @@
_BASE_: "../Base-RCNN-DilatedC5.yaml"
MODEL:
  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
  MASK_ON: False
  RESNETS:
    DEPTH: 50
SOLVER:
  STEPS: (210000, 250000)
  MAX_ITER: 270000

View File

@ -0,0 +1,6 @@
_BASE_: "../Base-RCNN-FPN.yaml"
MODEL:
  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
  MASK_ON: False
  RESNETS:
    DEPTH: 50

View File

@ -0,0 +1,9 @@
_BASE_: "../Base-RCNN-FPN.yaml"
MODEL:
  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
  MASK_ON: False
  RESNETS:
    DEPTH: 50
SOLVER:
  STEPS: (210000, 250000)
  MAX_ITER: 270000

View File

@ -0,0 +1,13 @@
_BASE_: "../Base-RCNN-FPN.yaml"
MODEL:
  MASK_ON: False
  WEIGHTS: "detectron2://ImageNetPretrained/FAIR/X-101-32x8d.pkl"
  PIXEL_STD: [57.375, 57.120, 58.395]
  RESNETS:
    STRIDE_IN_1X1: False # this is a C2 model
    NUM_GROUPS: 32
    WIDTH_PER_GROUP: 8
    DEPTH: 101
SOLVER:
  STEPS: (210000, 250000)
  MAX_ITER: 270000

View File

@ -0,0 +1,8 @@
_BASE_: "../Base-RetinaNet.yaml"
MODEL:
  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
  RESNETS:
    DEPTH: 101
SOLVER:
  STEPS: (210000, 250000)
  MAX_ITER: 270000

View File

@ -0,0 +1,5 @@
_BASE_: "../Base-RetinaNet.yaml"
MODEL:
  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
  RESNETS:
    DEPTH: 50

View File

@ -0,0 +1,8 @@
_BASE_: "../Base-RetinaNet.yaml"
MODEL:
  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
  RESNETS:
    DEPTH: 50
SOLVER:
  STEPS: (210000, 250000)
  MAX_ITER: 270000

View File

@ -0,0 +1,10 @@
_BASE_: "../Base-RCNN-C4.yaml"
MODEL:
  META_ARCHITECTURE: "ProposalNetwork"
  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
  MASK_ON: False
  RESNETS:
    DEPTH: 50
  RPN:
    PRE_NMS_TOPK_TEST: 12000
    POST_NMS_TOPK_TEST: 2000

View File

@ -0,0 +1,9 @@
_BASE_: "../Base-RCNN-FPN.yaml"
MODEL:
  META_ARCHITECTURE: "ProposalNetwork"
  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
  MASK_ON: False
  RESNETS:
    DEPTH: 50
  RPN:
    POST_NMS_TOPK_TEST: 2000

View File

@ -0,0 +1,9 @@
_BASE_: "../Base-RCNN-C4.yaml"
MODEL:
  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
  MASK_ON: True
  RESNETS:
    DEPTH: 101
SOLVER:
  STEPS: (210000, 250000)
  MAX_ITER: 270000

View File

@ -0,0 +1,9 @@
_BASE_: "../Base-RCNN-DilatedC5.yaml"
MODEL:
  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
  MASK_ON: True
  RESNETS:
    DEPTH: 101
SOLVER:
  STEPS: (210000, 250000)
  MAX_ITER: 270000

View File

@ -0,0 +1,9 @@
_BASE_: "../Base-RCNN-FPN.yaml"
MODEL:
  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
  MASK_ON: True
  RESNETS:
    DEPTH: 101
SOLVER:
  STEPS: (210000, 250000)
  MAX_ITER: 270000

View File

@ -0,0 +1,6 @@
_BASE_: "../Base-RCNN-C4.yaml"
MODEL:
  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
  MASK_ON: True
  RESNETS:
    DEPTH: 50

View File

@ -0,0 +1,9 @@
_BASE_: "../Base-RCNN-C4.yaml"
MODEL:
  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
  MASK_ON: True
  RESNETS:
    DEPTH: 50
SOLVER:
  STEPS: (210000, 250000)
  MAX_ITER: 270000

View File

@ -0,0 +1,6 @@
_BASE_: "../Base-RCNN-DilatedC5.yaml"
MODEL:
  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
  MASK_ON: True
  RESNETS:
    DEPTH: 50

View File

@ -0,0 +1,9 @@
_BASE_: "../Base-RCNN-DilatedC5.yaml"
MODEL:
  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
  MASK_ON: True
  RESNETS:
    DEPTH: 50
SOLVER:
  STEPS: (210000, 250000)
  MAX_ITER: 270000

View File

@ -0,0 +1,6 @@
_BASE_: "../Base-RCNN-FPN.yaml"
MODEL:
  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
  MASK_ON: True
  RESNETS:
    DEPTH: 50

View File

@ -0,0 +1,9 @@
_BASE_: "../Base-RCNN-FPN.yaml"
MODEL:
  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
  MASK_ON: True
  RESNETS:
    DEPTH: 50
SOLVER:
  STEPS: (210000, 250000)
  MAX_ITER: 270000

View File

@ -0,0 +1,13 @@
_BASE_: "../Base-RCNN-FPN.yaml"
MODEL:
  MASK_ON: True
  WEIGHTS: "detectron2://ImageNetPretrained/FAIR/X-101-32x8d.pkl"
  PIXEL_STD: [57.375, 57.120, 58.395]
  RESNETS:
    STRIDE_IN_1X1: False # this is a C2 model
    NUM_GROUPS: 32
    WIDTH_PER_GROUP: 8
    DEPTH: 101
SOLVER:
  STEPS: (210000, 250000)
  MAX_ITER: 270000

View File

@ -0,0 +1,15 @@
_BASE_: "../Base-RCNN-FPN.yaml"
MODEL:
  KEYPOINT_ON: True
  ROI_HEADS:
    NUM_CLASSES: 1
  ROI_BOX_HEAD:
    SMOOTH_L1_BETA: 0.5 # Keypoint AP degrades (though box AP improves) when using plain L1 loss
  RPN:
    # Detectron1 uses 2000 proposals per-batch, but this option is per-image in detectron2.
    # 1000 proposals per-image is found to hurt box AP.
    # Therefore we increase it to 1500 per-image.
    POST_NMS_TOPK_TRAIN: 1500
DATASETS:
  TRAIN: ("keypoints_coco_2017_train",)
  TEST: ("keypoints_coco_2017_val",)

View File

@ -0,0 +1,8 @@
_BASE_: "Base-Keypoint-RCNN-FPN.yaml"
MODEL:
  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
  RESNETS:
    DEPTH: 101
SOLVER:
  STEPS: (210000, 250000)
  MAX_ITER: 270000

View File

@ -0,0 +1,5 @@
_BASE_: "Base-Keypoint-RCNN-FPN.yaml"
MODEL:
  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
  RESNETS:
    DEPTH: 50

View File

@ -0,0 +1,8 @@
_BASE_: "Base-Keypoint-RCNN-FPN.yaml"
MODEL:
  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
  RESNETS:
    DEPTH: 50
SOLVER:
  STEPS: (210000, 250000)
  MAX_ITER: 270000

View File

@ -0,0 +1,12 @@
_BASE_: "Base-Keypoint-RCNN-FPN.yaml"
MODEL:
  WEIGHTS: "detectron2://ImageNetPretrained/FAIR/X-101-32x8d.pkl"
  PIXEL_STD: [57.375, 57.120, 58.395]
  RESNETS:
    STRIDE_IN_1X1: False # this is a C2 model
    NUM_GROUPS: 32
    WIDTH_PER_GROUP: 8
    DEPTH: 101
SOLVER:
  STEPS: (210000, 250000)
  MAX_ITER: 270000

View File

@ -0,0 +1,9 @@
_BASE_: "../Base-RCNN-FPN.yaml"
MODEL:
  META_ARCHITECTURE: "PanopticFPN"
  MASK_ON: True
  SEM_SEG_HEAD:
    LOSS_WEIGHT: 0.5
DATASETS:
  TRAIN: ("coco_2017_train_panoptic_separated",)
  TEST: ("coco_2017_val_panoptic_separated",)

View File

@ -0,0 +1,8 @@
_BASE_: "Base-Panoptic-FPN.yaml"
MODEL:
  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
  RESNETS:
    DEPTH: 101
SOLVER:
  STEPS: (210000, 250000)
  MAX_ITER: 270000

View File

@ -0,0 +1,5 @@
_BASE_: "Base-Panoptic-FPN.yaml"
MODEL:
  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
  RESNETS:
    DEPTH: 50

View File

@ -0,0 +1,8 @@
_BASE_: "Base-Panoptic-FPN.yaml"
MODEL:
  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
  RESNETS:
    DEPTH: 50
SOLVER:
  STEPS: (210000, 250000)
  MAX_ITER: 270000

View File

@ -0,0 +1,27 @@
_BASE_: "../Base-RCNN-FPN.yaml"
MODEL:
  # WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
  # For better, more stable performance initialize from COCO
  WEIGHTS: "detectron2://COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x/137849600/model_final_f10217.pkl"
  MASK_ON: True
  ROI_HEADS:
    NUM_CLASSES: 8
# This is similar to the setting used in Mask R-CNN paper, Appendix A
# But there are some differences, e.g., we did not initialize the output
# layer using the corresponding classes from COCO
INPUT:
  MIN_SIZE_TRAIN: (800, 832, 864, 896, 928, 960, 992, 1024)
  MIN_SIZE_TRAIN_SAMPLING: "choice"
  MIN_SIZE_TEST: 1024
  MAX_SIZE_TRAIN: 2048
  MAX_SIZE_TEST: 2048
DATASETS:
  TRAIN: ("cityscapes_fine_instance_seg_train",)
  TEST: ("cityscapes_fine_instance_seg_val",)
SOLVER:
  BASE_LR: 0.01
  STEPS: (18000,)
  MAX_ITER: 24000
  IMS_PER_BATCH: 8
TEST:
  EVAL_PERIOD: 8000
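As the comments note, this config fine-tunes from a COCO-pretrained Mask R-CNN while shrinking the ROI head to Cityscapes' 8 instance classes. A minimal sketch of the same pattern for one's own data; the dataset names are placeholders and assume datasets have already been registered:

```python
from detectron2.config import get_cfg
from detectron2.engine import DefaultTrainer

cfg = get_cfg()
cfg.merge_from_file("configs/Cityscapes/mask_rcnn_R_50_FPN.yaml")
# Start from COCO weights; mismatched output layers are re-initialized with a warning.
cfg.MODEL.WEIGHTS = "detectron2://COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x/137849600/model_final_f10217.pkl"
cfg.MODEL.ROI_HEADS.NUM_CLASSES = 8        # number of "thing" classes in the target dataset
cfg.DATASETS.TRAIN = ("my_train_split",)   # placeholder registered dataset names
cfg.DATASETS.TEST = ("my_val_split",)

trainer = DefaultTrainer(cfg)
trainer.resume_or_load(resume=False)
# trainer.train()
```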

View File

@ -0,0 +1,83 @@
Detectron2's model zoo uses experimental settings and a few implementation details that differ from Detectron.
The differences in implementation details are described in
[Compatibility with Other Libraries](../../docs/notes/compatibility.md).
The differences in the model zoo's experimental settings include:
* Use scale augmentation during training. This improves AP with lower training cost.
* Use L1 loss instead of smooth L1 loss for simplicity. This sometimes improves box AP but may
affect other AP.
* Use `POOLER_SAMPLING_RATIO=0` instead of 2. This does not significantly affect AP.
* Use `ROIAlignV2`. This does not significantly affect AP.
In this directory, we provide a few configs that __do not__ have the above changes.
They mimic Detectron's behavior as closely as possible,
and provide a fair comparison of accuracy and speed against Detectron.
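Concretely, Detectron1's box regression uses a smooth L1 (Huber-style) loss with a beta transition point, while detectron2's defaults use plain L1; the `SMOOTH_L1_BETA` values in the configs below restore the Detectron1 behavior. A small sketch of the two losses:

```python
import torch

def smooth_l1(x, beta):
    # Quadratic below beta, linear above; beta -> 0 recovers plain L1.
    x = x.abs()
    return torch.where(x < beta, 0.5 * x ** 2 / beta, x - 0.5 * beta)

diff = torch.linspace(-2, 2, 5)
print(smooth_l1(diff, beta=0.1111))  # RPN beta used in these comparison configs
print(diff.abs())                    # plain L1, the detectron2 default (beta = 0)
```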
<!--
./gen_html_table.py --config 'Detectron1-Comparisons/*.yaml' --name "Faster R-CNN" "Keypoint R-CNN" "Mask R-CNN" --fields lr_sched train_speed inference_speed mem box_AP mask_AP keypoint_AP --base-dir ../../../configs/Detectron1-Comparisons
-->
<table><tbody>
<!-- START TABLE -->
<!-- TABLE HEADER -->
<th valign="bottom">Name</th>
<th valign="bottom">lr<br/>sched</th>
<th valign="bottom">train<br/>time<br/>(s/iter)</th>
<th valign="bottom">inference<br/>time<br/>(s/im)</th>
<th valign="bottom">train<br/>mem<br/>(GB)</th>
<th valign="bottom">box<br/>AP</th>
<th valign="bottom">mask<br/>AP</th>
<th valign="bottom">kp.<br/>AP</th>
<th valign="bottom">model id</th>
<th valign="bottom">download</th>
<!-- TABLE BODY -->
<!-- ROW: faster_rcnn_R_50_FPN_noaug_1x -->
<tr><td align="left"><a href="faster_rcnn_R_50_FPN_noaug_1x.yaml">Faster R-CNN</a></td>
<td align="center">1x</td>
<td align="center">0.219</td>
<td align="center">0.038</td>
<td align="center">3.1</td>
<td align="center">36.9</td>
<td align="center"></td>
<td align="center"></td>
<td align="center">137781054</td>
<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/Detectron1-Comparisons/faster_rcnn_R_50_FPN_noaug_1x/137781054/model_final_7ab50c.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/Detectron1-Comparisons/faster_rcnn_R_50_FPN_noaug_1x/137781054/metrics.json">metrics</a></td>
</tr>
<!-- ROW: keypoint_rcnn_R_50_FPN_1x -->
<tr><td align="left"><a href="keypoint_rcnn_R_50_FPN_1x.yaml">Keypoint R-CNN</a></td>
<td align="center">1x</td>
<td align="center">0.313</td>
<td align="center">0.071</td>
<td align="center">5.0</td>
<td align="center">53.1</td>
<td align="center"></td>
<td align="center">64.2</td>
<td align="center">137781195</td>
<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/Detectron1-Comparisons/keypoint_rcnn_R_50_FPN_1x/137781195/model_final_cce136.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/Detectron1-Comparisons/keypoint_rcnn_R_50_FPN_1x/137781195/metrics.json">metrics</a></td>
</tr>
<!-- ROW: mask_rcnn_R_50_FPN_noaug_1x -->
<tr><td align="left"><a href="mask_rcnn_R_50_FPN_noaug_1x.yaml">Mask R-CNN</a></td>
<td align="center">1x</td>
<td align="center">0.273</td>
<td align="center">0.043</td>
<td align="center">3.4</td>
<td align="center">37.8</td>
<td align="center">34.9</td>
<td align="center"></td>
<td align="center">137781281</td>
<td align="center"><a href="https://dl.fbaipublicfiles.com/detectron2/Detectron1-Comparisons/mask_rcnn_R_50_FPN_noaug_1x/137781281/model_final_62ca52.pkl">model</a>&nbsp;|&nbsp;<a href="https://dl.fbaipublicfiles.com/detectron2/Detectron1-Comparisons/mask_rcnn_R_50_FPN_noaug_1x/137781281/metrics.json">metrics</a></td>
</tr>
</tbody></table>
## Comparisons:
* Faster R-CNN: Detectron's AP is 36.7, similar to ours.
* Keypoint R-CNN: Detectron's AP is 53.6 box / 64.2 keypoint. Fixing a Detectron
[bug](https://github.com/facebookresearch/Detectron/issues/459) leads to a drop in box AP,
which can be compensated for by some parameter tuning.
* Mask R-CNN: Detectron's AP is 37.7 box / 33.9 mask. We are 1 point better in mask AP, due to a more correct implementation.
For speed comparison, see [benchmarks](https://detectron2.readthedocs.io/notes/benchmarks.html).

View File

@ -0,0 +1,17 @@
_BASE_: "../Base-RCNN-FPN.yaml"
MODEL:
  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
  MASK_ON: False
  RESNETS:
    DEPTH: 50
  # Detectron1 uses smooth L1 loss with some magic beta values.
  # The defaults are changed to L1 loss in Detectron2.
  RPN:
    SMOOTH_L1_BETA: 0.1111
  ROI_BOX_HEAD:
    SMOOTH_L1_BETA: 1.0
    POOLER_SAMPLING_RATIO: 2
    POOLER_TYPE: "ROIAlign"
INPUT:
  # no scale augmentation
  MIN_SIZE_TRAIN: (800, )

View File

@ -0,0 +1,27 @@
_BASE_: "../Base-RCNN-FPN.yaml"
MODEL:
  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
  KEYPOINT_ON: True
  RESNETS:
    DEPTH: 50
  ROI_HEADS:
    NUM_CLASSES: 1
  ROI_KEYPOINT_HEAD:
    POOLER_RESOLUTION: 14
    POOLER_SAMPLING_RATIO: 2
    POOLER_TYPE: "ROIAlign"
  # Detectron1 uses smooth L1 loss with some magic beta values.
  # The defaults are changed to L1 loss in Detectron2.
  ROI_BOX_HEAD:
    SMOOTH_L1_BETA: 1.0
    POOLER_SAMPLING_RATIO: 2
    POOLER_TYPE: "ROIAlign"
  RPN:
    SMOOTH_L1_BETA: 0.1111
    # Detectron1 uses 2000 proposals per-batch, but this option is per-image in detectron2
    # 1000 proposals per-image is found to hurt box AP.
    # Therefore we increase it to 1500 per-image.
    POST_NMS_TOPK_TRAIN: 1500
DATASETS:
  TRAIN: ("keypoints_coco_2017_train",)
  TEST: ("keypoints_coco_2017_val",)

View File

@ -0,0 +1,20 @@
_BASE_: "../Base-RCNN-FPN.yaml"
MODEL:
  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
  MASK_ON: True
  RESNETS:
    DEPTH: 50
  # Detectron1 uses smooth L1 loss with some magic beta values.
  # The defaults are changed to L1 loss in Detectron2.
  RPN:
    SMOOTH_L1_BETA: 0.1111
  ROI_BOX_HEAD:
    SMOOTH_L1_BETA: 1.0
    POOLER_SAMPLING_RATIO: 2
    POOLER_TYPE: "ROIAlign"
  ROI_MASK_HEAD:
    POOLER_SAMPLING_RATIO: 2
    POOLER_TYPE: "ROIAlign"
INPUT:
  # no scale augmentation
  MIN_SIZE_TRAIN: (800, )

View File

@ -0,0 +1,19 @@
_BASE_: "../Base-RCNN-FPN.yaml"
MODEL:
  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
  MASK_ON: True
  RESNETS:
    DEPTH: 101
  ROI_HEADS:
    NUM_CLASSES: 1230
    SCORE_THRESH_TEST: 0.0001
INPUT:
  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
DATASETS:
  TRAIN: ("lvis_v0.5_train",)
  TEST: ("lvis_v0.5_val",)
TEST:
  DETECTIONS_PER_IMAGE: 300 # LVIS allows up to 300
DATALOADER:
  SAMPLER_TRAIN: "RepeatFactorTrainingSampler"
  REPEAT_THRESHOLD: 0.001
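The `RepeatFactorTrainingSampler` with `REPEAT_THRESHOLD: 0.001` implements the repeat-factor oversampling from the LVIS paper: images containing rare categories are repeated more often during an epoch. A sketch of the per-category and per-image repeat factors; the toy frequencies are illustrative only:

```python
import math

def category_repeat_factor(freq, threshold=0.001):
    # freq: fraction of training images that contain the category.
    return max(1.0, math.sqrt(threshold / freq))

def image_repeat_factor(category_freqs, threshold=0.001):
    # An image is repeated according to its rarest annotated category.
    return max(category_repeat_factor(f, threshold) for f in category_freqs)

print(category_repeat_factor(0.1))          # common category -> 1.0 (never oversampled)
print(category_repeat_factor(0.00001))      # rare category   -> 10.0
print(image_repeat_factor([0.1, 0.00004]))  # image with one rare category -> 5.0
```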

View File

@ -0,0 +1,19 @@
_BASE_: "../Base-RCNN-FPN.yaml"
MODEL:
  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
  MASK_ON: True
  RESNETS:
    DEPTH: 50
  ROI_HEADS:
    NUM_CLASSES: 1230
    SCORE_THRESH_TEST: 0.0001
INPUT:
  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
DATASETS:
  TRAIN: ("lvis_v0.5_train",)
  TEST: ("lvis_v0.5_val",)
TEST:
  DETECTIONS_PER_IMAGE: 300 # LVIS allows up to 300
DATALOADER:
  SAMPLER_TRAIN: "RepeatFactorTrainingSampler"
  REPEAT_THRESHOLD: 0.001

View File

@ -0,0 +1,23 @@
_BASE_: "../Base-RCNN-FPN.yaml"
MODEL:
  WEIGHTS: "detectron2://ImageNetPretrained/FAIR/X-101-32x8d.pkl"
  PIXEL_STD: [57.375, 57.120, 58.395]
  MASK_ON: True
  RESNETS:
    STRIDE_IN_1X1: False # this is a C2 model
    NUM_GROUPS: 32
    WIDTH_PER_GROUP: 8
    DEPTH: 101
  ROI_HEADS:
    NUM_CLASSES: 1230
    SCORE_THRESH_TEST: 0.0001
INPUT:
  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
DATASETS:
  TRAIN: ("lvis_v0.5_train",)
  TEST: ("lvis_v0.5_val",)
TEST:
  DETECTIONS_PER_IMAGE: 300 # LVIS allows up to 300
DATALOADER:
  SAMPLER_TRAIN: "RepeatFactorTrainingSampler"
  REPEAT_THRESHOLD: 0.001

View File

@ -0,0 +1,12 @@
_BASE_: "../Base-RCNN-FPN.yaml"
MODEL:
  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
  MASK_ON: True
  RESNETS:
    DEPTH: 50
  ROI_HEADS:
    NAME: CascadeROIHeads
  ROI_BOX_HEAD:
    CLS_AGNOSTIC_BBOX_REG: True
  RPN:
    POST_NMS_TOPK_TRAIN: 2000

View File

@ -0,0 +1,15 @@
_BASE_: "../Base-RCNN-FPN.yaml"
MODEL:
  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
  MASK_ON: True
  RESNETS:
    DEPTH: 50
  ROI_HEADS:
    NAME: CascadeROIHeads
  ROI_BOX_HEAD:
    CLS_AGNOSTIC_BBOX_REG: True
  RPN:
    POST_NMS_TOPK_TRAIN: 2000
SOLVER:
  STEPS: (210000, 250000)
  MAX_ITER: 270000

View File

@ -0,0 +1,36 @@
_BASE_: "../Base-RCNN-FPN.yaml"
MODEL:
  MASK_ON: True
  WEIGHTS: "catalog://ImageNetPretrained/FAIR/X-152-32x8d-IN5k"
  RESNETS:
    STRIDE_IN_1X1: False # this is a C2 model
    NUM_GROUPS: 32
    WIDTH_PER_GROUP: 8
    DEPTH: 152
    DEFORM_ON_PER_STAGE: [False, True, True, True]
  ROI_HEADS:
    NAME: "CascadeROIHeads"
  ROI_BOX_HEAD:
    NAME: "FastRCNNConvFCHead"
    NUM_CONV: 4
    NUM_FC: 1
    NORM: "GN"
    CLS_AGNOSTIC_BBOX_REG: True
  ROI_MASK_HEAD:
    NUM_CONV: 8
    NORM: "GN"
  RPN:
    POST_NMS_TOPK_TRAIN: 2000
SOLVER:
  IMS_PER_BATCH: 128
  STEPS: (35000, 45000)
  MAX_ITER: 50000
  BASE_LR: 0.16
INPUT:
  MIN_SIZE_TRAIN: (640, 864)
  MIN_SIZE_TRAIN_SAMPLING: "range"
  MAX_SIZE_TRAIN: 1440
  CROP:
    ENABLED: True
TEST:
  EVAL_PERIOD: 2500

View File

@ -0,0 +1,42 @@
_BASE_: "../Base-RCNN-FPN.yaml"
MODEL:
  MASK_ON: True
  # WEIGHTS: "catalog://ImageNetPretrained/FAIR/X-152-32x8d-IN5k"
  WEIGHTS: "model_0039999_e76410.pkl"
  RESNETS:
    STRIDE_IN_1X1: False # this is a C2 model
    NUM_GROUPS: 32
    WIDTH_PER_GROUP: 8
    DEPTH: 152
    DEFORM_ON_PER_STAGE: [False, True, True, True]
  ROI_HEADS:
    NAME: "CascadeROIHeads"
    NUM_CLASSES: 1
  ROI_BOX_HEAD:
    NAME: "FastRCNNConvFCHead"
    NUM_CONV: 4
    NUM_FC: 1
    NORM: "GN"
    CLS_AGNOSTIC_BBOX_REG: True
  ROI_MASK_HEAD:
    NUM_CONV: 8
    NORM: "GN"
  RPN:
    POST_NMS_TOPK_TRAIN: 2000
SOLVER:
  # IMS_PER_BATCH: 128
  IMS_PER_BATCH: 1
  STEPS: (35000, 45000)
  MAX_ITER: 50000
  BASE_LR: 0.16
INPUT:
  MIN_SIZE_TRAIN: (640, 864)
  MIN_SIZE_TRAIN_SAMPLING: "range"
  MAX_SIZE_TRAIN: 1440
  CROP:
    ENABLED: True
TEST:
  EVAL_PERIOD: 2500
DATASETS:
  TRAIN: ("CIHP_train","VIP_trainval")
  TEST: ("CIHP_val",)

View File

@ -0,0 +1,25 @@
_BASE_: "cascade_mask_rcnn_X_152_32x8d_FPN_IN5k_gn_dconv.yaml"
MODEL:
  MASK_ON: True
  ROI_HEADS:
    NMS_THRESH_TEST: 0.95
    SCORE_THRESH_TEST: 0.5
    NUM_CLASSES: 1
SOLVER:
  IMS_PER_BATCH: 1
  STEPS: (30000, 45000)
  MAX_ITER: 50000
  BASE_LR: 0.02
INPUT:
  MIN_SIZE_TRAIN: (640, 864)
  MIN_SIZE_TRAIN_SAMPLING: "range"
  MAX_SIZE_TRAIN: 1440
  CROP:
    ENABLED: True
TEST:
  AUG:
    ENABLED: True
DATASETS:
  TRAIN: ("demo_train",)
  TEST: ("demo_val",)
OUTPUT_DIR: "../../data/DemoDataset/detectron2_prediction"
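This demo config points at `demo_train` / `demo_val` dataset names and writes predictions under `OUTPUT_DIR`; those names only resolve once the datasets are registered. A rough sketch of registering them, assuming COCO-format annotations; every path, file name, and the config filename below are placeholders, not the repo's actual layout:

```python
from detectron2.data.datasets import register_coco_instances
from detectron2.config import get_cfg
from detectron2.engine import DefaultPredictor

# Placeholder annotation/image paths; adjust to the real DemoDataset layout.
register_coco_instances("demo_train", {}, "data/DemoDataset/train.json", "data/DemoDataset/train_imgs")
register_coco_instances("demo_val", {}, "data/DemoDataset/val.json", "data/DemoDataset/val_imgs")

cfg = get_cfg()
cfg.merge_from_file("configs/Misc/demo.yaml")  # placeholder name for the config shown above
predictor = DefaultPredictor(cfg)
```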

View File

@ -0,0 +1,10 @@
_BASE_: "../Base-RCNN-FPN.yaml"
MODEL:
  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
  MASK_ON: True
  RESNETS:
    DEPTH: 50
  ROI_BOX_HEAD:
    CLS_AGNOSTIC_BBOX_REG: True
  ROI_MASK_HEAD:
    CLS_AGNOSTIC_MASK: True

View File

@ -0,0 +1,8 @@
_BASE_: "../Base-RCNN-FPN.yaml"
MODEL:
  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
  MASK_ON: True
  RESNETS:
    DEPTH: 50
    DEFORM_ON_PER_STAGE: [False, True, True, True] # on Res3,Res4,Res5
    DEFORM_MODULATED: False

View File

@ -0,0 +1,11 @@
_BASE_: "../Base-RCNN-FPN.yaml"
MODEL:
  WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
  MASK_ON: True
  RESNETS:
    DEPTH: 50
    DEFORM_ON_PER_STAGE: [False, True, True, True] # on Res3,Res4,Res5
    DEFORM_MODULATED: False
SOLVER:
  STEPS: (210000, 250000)
  MAX_ITER: 270000

Some files were not shown because too many files have changed in this diff.