Add at new repo again
5
vton-api/preprocess/humanparsing/modules/__init__.py
Normal file
@@ -0,0 +1,5 @@
from .bn import ABN, InPlaceABN, InPlaceABNSync
from .functions import ACT_RELU, ACT_LEAKY_RELU, ACT_ELU, ACT_NONE
from .misc import GlobalAvgPool2d, SingleGPU
from .residual import IdentityResidualBlock
from .dense import DenseModule
132
vton-api/preprocess/humanparsing/modules/bn.py
Normal file
@@ -0,0 +1,132 @@
import torch
import torch.nn as nn
import torch.nn.functional as functional

try:
    from queue import Queue
except ImportError:
    from Queue import Queue

from .functions import *


class ABN(nn.Module):
    """Activated Batch Normalization

    This gathers a `BatchNorm2d` and an activation function in a single module
    """

    def __init__(self, num_features, eps=1e-5, momentum=0.1, affine=True, activation="leaky_relu", slope=0.01):
        """Creates an Activated Batch Normalization module

        Parameters
        ----------
        num_features : int
            Number of feature channels in the input and output.
        eps : float
            Small constant to prevent numerical issues.
        momentum : float
            Momentum factor applied to compute running statistics.
        affine : bool
            If `True` apply learned scale and shift transformation after normalization.
        activation : str
            Name of the activation function, one of: `leaky_relu`, `elu` or `none`.
        slope : float
            Negative slope for the `leaky_relu` activation.
        """
        super(ABN, self).__init__()
        self.num_features = num_features
        self.affine = affine
        self.eps = eps
        self.momentum = momentum
        self.activation = activation
        self.slope = slope
        if self.affine:
            self.weight = nn.Parameter(torch.ones(num_features))
            self.bias = nn.Parameter(torch.zeros(num_features))
        else:
            self.register_parameter('weight', None)
            self.register_parameter('bias', None)
        self.register_buffer('running_mean', torch.zeros(num_features))
        self.register_buffer('running_var', torch.ones(num_features))
        self.reset_parameters()

    def reset_parameters(self):
        nn.init.constant_(self.running_mean, 0)
        nn.init.constant_(self.running_var, 1)
        if self.affine:
            nn.init.constant_(self.weight, 1)
            nn.init.constant_(self.bias, 0)

    def forward(self, x):
        x = functional.batch_norm(x, self.running_mean, self.running_var, self.weight, self.bias,
                                  self.training, self.momentum, self.eps)

        if self.activation == ACT_RELU:
            return functional.relu(x, inplace=True)
        elif self.activation == ACT_LEAKY_RELU:
            return functional.leaky_relu(x, negative_slope=self.slope, inplace=True)
        elif self.activation == ACT_ELU:
            return functional.elu(x, inplace=True)
        else:
            return x

    def __repr__(self):
        rep = '{name}({num_features}, eps={eps}, momentum={momentum},' \
              ' affine={affine}, activation={activation}'
        if self.activation == "leaky_relu":
            rep += ', slope={slope})'
        else:
            rep += ')'
        return rep.format(name=self.__class__.__name__, **self.__dict__)


class InPlaceABN(ABN):
    """InPlace Activated Batch Normalization"""

    def __init__(self, num_features, eps=1e-5, momentum=0.1, affine=True, activation="leaky_relu", slope=0.01):
        """Creates an InPlace Activated Batch Normalization module

        Parameters
        ----------
        num_features : int
            Number of feature channels in the input and output.
        eps : float
            Small constant to prevent numerical issues.
        momentum : float
            Momentum factor applied to compute running statistics.
        affine : bool
            If `True` apply learned scale and shift transformation after normalization.
        activation : str
            Name of the activation function, one of: `leaky_relu`, `elu` or `none`.
        slope : float
            Negative slope for the `leaky_relu` activation.
        """
        super(InPlaceABN, self).__init__(num_features, eps, momentum, affine, activation, slope)

    def forward(self, x):
        x, _, _ = inplace_abn(x, self.weight, self.bias, self.running_mean, self.running_var,
                              self.training, self.momentum, self.eps, self.activation, self.slope)
        return x


class InPlaceABNSync(ABN):
    """InPlace Activated Batch Normalization with cross-GPU synchronization

    This assumes that it will be replicated across GPUs using the same mechanism as in `nn.DistributedDataParallel`.
    """

    def forward(self, x):
        x, _, _ = inplace_abn_sync(x, self.weight, self.bias, self.running_mean, self.running_var,
                                   self.training, self.momentum, self.eps, self.activation, self.slope)
        return x

    def __repr__(self):
        rep = '{name}({num_features}, eps={eps}, momentum={momentum},' \
              ' affine={affine}, activation={activation}'
        if self.activation == "leaky_relu":
            rep += ', slope={slope})'
        else:
            rep += ')'
        return rep.format(name=self.__class__.__name__, **self.__dict__)
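For context, a minimal usage sketch of the modules above (illustrative only, not part of the committed files; the import path and input sizes are assumptions, and InPlaceABN requires the backend in functions.py to build):

# Hypothetical usage sketch for ABN / InPlaceABN.
import torch
from modules import ABN, InPlaceABN   # assumed import path

bn = ABN(64, activation="leaky_relu", slope=0.01)   # BatchNorm2d + activation in one module
ibn = InPlaceABN(64)                                 # memory-saving in-place variant (needs the compiled backend)

x = torch.randn(2, 64, 32, 32)
y = bn(x)                                            # same shape as x: (2, 64, 32, 32)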
84
vton-api/preprocess/humanparsing/modules/deeplab.py
Normal file
@@ -0,0 +1,84 @@
import torch
import torch.nn as nn
import torch.nn.functional as functional

from models._util import try_index
from .bn import ABN


class DeeplabV3(nn.Module):
    def __init__(self,
                 in_channels,
                 out_channels,
                 hidden_channels=256,
                 dilations=(12, 24, 36),
                 norm_act=ABN,
                 pooling_size=None):
        super(DeeplabV3, self).__init__()
        self.pooling_size = pooling_size

        self.map_convs = nn.ModuleList([
            nn.Conv2d(in_channels, hidden_channels, 1, bias=False),
            nn.Conv2d(in_channels, hidden_channels, 3, bias=False, dilation=dilations[0], padding=dilations[0]),
            nn.Conv2d(in_channels, hidden_channels, 3, bias=False, dilation=dilations[1], padding=dilations[1]),
            nn.Conv2d(in_channels, hidden_channels, 3, bias=False, dilation=dilations[2], padding=dilations[2])
        ])
        self.map_bn = norm_act(hidden_channels * 4)

        self.global_pooling_conv = nn.Conv2d(in_channels, hidden_channels, 1, bias=False)
        self.global_pooling_bn = norm_act(hidden_channels)

        self.red_conv = nn.Conv2d(hidden_channels * 4, out_channels, 1, bias=False)
        self.pool_red_conv = nn.Conv2d(hidden_channels, out_channels, 1, bias=False)
        self.red_bn = norm_act(out_channels)

        self.reset_parameters(self.map_bn.activation, self.map_bn.slope)

    def reset_parameters(self, activation, slope):
        gain = nn.init.calculate_gain(activation, slope)
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.xavier_normal_(m.weight.data, gain)
                if hasattr(m, "bias") and m.bias is not None:
                    nn.init.constant_(m.bias, 0)
            elif isinstance(m, ABN):
                if hasattr(m, "weight") and m.weight is not None:
                    nn.init.constant_(m.weight, 1)
                if hasattr(m, "bias") and m.bias is not None:
                    nn.init.constant_(m.bias, 0)

    def forward(self, x):
        # Map convolutions
        out = torch.cat([m(x) for m in self.map_convs], dim=1)
        out = self.map_bn(out)
        out = self.red_conv(out)

        # Global pooling
        pool = self._global_pooling(x)
        pool = self.global_pooling_conv(pool)
        pool = self.global_pooling_bn(pool)
        pool = self.pool_red_conv(pool)
        if self.training or self.pooling_size is None:
            pool = pool.repeat(1, 1, x.size(2), x.size(3))

        out += pool
        out = self.red_bn(out)
        return out

    def _global_pooling(self, x):
        if self.training or self.pooling_size is None:
            pool = x.view(x.size(0), x.size(1), -1).mean(dim=-1)
            pool = pool.view(x.size(0), x.size(1), 1, 1)
        else:
            pooling_size = (min(try_index(self.pooling_size, 0), x.shape[2]),
                            min(try_index(self.pooling_size, 1), x.shape[3]))
            padding = (
                (pooling_size[1] - 1) // 2,
                (pooling_size[1] - 1) // 2 if pooling_size[1] % 2 == 1 else (pooling_size[1] - 1) // 2 + 1,
                (pooling_size[0] - 1) // 2,
                (pooling_size[0] - 1) // 2 if pooling_size[0] % 2 == 1 else (pooling_size[0] - 1) // 2 + 1
            )

            pool = functional.avg_pool2d(x, pooling_size, stride=1)
            pool = functional.pad(pool, pad=padding, mode="replicate")
        return pool
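The head above is an ASPP-style module: four parallel dilated convolutions plus a global-pooling branch, both reduced to out_channels and summed. A shape-check sketch (illustrative; import paths and the models._util dependency are assumed to be importable):

# Hypothetical shape check for the DeeplabV3 head.
import torch
from modules import ABN               # assumed import path
from modules.deeplab import DeeplabV3

head = DeeplabV3(in_channels=2048, out_channels=256, hidden_channels=256,
                 dilations=(12, 24, 36), norm_act=ABN)
feat = torch.randn(1, 2048, 32, 32)
out = head(feat)                      # expected: (1, 256, 32, 32); spatial size is preserved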
42
vton-api/preprocess/humanparsing/modules/dense.py
Normal file
@@ -0,0 +1,42 @@
from collections import OrderedDict

import torch
import torch.nn as nn

from .bn import ABN


class DenseModule(nn.Module):
    def __init__(self, in_channels, growth, layers, bottleneck_factor=4, norm_act=ABN, dilation=1):
        super(DenseModule, self).__init__()
        self.in_channels = in_channels
        self.growth = growth
        self.layers = layers

        self.convs1 = nn.ModuleList()
        self.convs3 = nn.ModuleList()
        for i in range(self.layers):
            self.convs1.append(nn.Sequential(OrderedDict([
                ("bn", norm_act(in_channels)),
                ("conv", nn.Conv2d(in_channels, self.growth * bottleneck_factor, 1, bias=False))
            ])))
            self.convs3.append(nn.Sequential(OrderedDict([
                ("bn", norm_act(self.growth * bottleneck_factor)),
                ("conv", nn.Conv2d(self.growth * bottleneck_factor, self.growth, 3, padding=dilation, bias=False,
                                   dilation=dilation))
            ])))
            in_channels += self.growth

    @property
    def out_channels(self):
        return self.in_channels + self.growth * self.layers

    def forward(self, x):
        inputs = [x]
        for i in range(self.layers):
            x = torch.cat(inputs, dim=1)
            x = self.convs1[i](x)
            x = self.convs3[i](x)
            inputs += [x]

        return torch.cat(inputs, dim=1)
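Each layer concatenates its growth new channels onto the running input, so the output width grows linearly with depth. A channel-bookkeeping sketch (illustrative; import path assumed):

# DenseModule channel bookkeeping.
from modules import ABN               # assumed import path
from modules.dense import DenseModule

dm = DenseModule(in_channels=64, growth=16, layers=4, norm_act=ABN)
# out_channels = 64 + 16 * 4 = 128
assert dm.out_channels == 128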
245
vton-api/preprocess/humanparsing/modules/functions.py
Normal file
@@ -0,0 +1,245 @@
import pdb
from os import path
import torch
import torch.distributed as dist
import torch.autograd as autograd
import torch.cuda.comm as comm
from torch.autograd.function import once_differentiable
from torch.utils.cpp_extension import load

_src_path = path.join(path.dirname(path.abspath(__file__)), "src")
_backend = load(name="inplace_abn",
                extra_cflags=["-O3"],
                sources=[path.join(_src_path, f) for f in [
                    "inplace_abn.cpp",
                    "inplace_abn_cpu.cpp",
                    "inplace_abn_cuda.cu",
                    "inplace_abn_cuda_half.cu"
                ]],
                extra_cuda_cflags=["--expt-extended-lambda"])

# Activation names
ACT_RELU = "relu"
ACT_LEAKY_RELU = "leaky_relu"
ACT_ELU = "elu"
ACT_NONE = "none"


def _check(fn, *args, **kwargs):
    success = fn(*args, **kwargs)
    if not success:
        raise RuntimeError("CUDA Error encountered in {}".format(fn))


def _broadcast_shape(x):
    out_size = []
    for i, s in enumerate(x.size()):
        if i != 1:
            out_size.append(1)
        else:
            out_size.append(s)
    return out_size


def _reduce(x):
    if len(x.size()) == 2:
        return x.sum(dim=0)
    else:
        n, c = x.size()[0:2]
        return x.contiguous().view((n, c, -1)).sum(2).sum(0)


def _count_samples(x):
    count = 1
    for i, s in enumerate(x.size()):
        if i != 1:
            count *= s
    return count


def _act_forward(ctx, x):
    if ctx.activation == ACT_LEAKY_RELU:
        _backend.leaky_relu_forward(x, ctx.slope)
    elif ctx.activation == ACT_ELU:
        _backend.elu_forward(x)
    elif ctx.activation == ACT_NONE:
        pass


def _act_backward(ctx, x, dx):
    if ctx.activation == ACT_LEAKY_RELU:
        _backend.leaky_relu_backward(x, dx, ctx.slope)
    elif ctx.activation == ACT_ELU:
        _backend.elu_backward(x, dx)
    elif ctx.activation == ACT_NONE:
        pass


class InPlaceABN(autograd.Function):
    @staticmethod
    def forward(ctx, x, weight, bias, running_mean, running_var,
                training=True, momentum=0.1, eps=1e-05, activation=ACT_LEAKY_RELU, slope=0.01):
        # Save context
        ctx.training = training
        ctx.momentum = momentum
        ctx.eps = eps
        ctx.activation = activation
        ctx.slope = slope
        ctx.affine = weight is not None and bias is not None

        # Prepare inputs
        count = _count_samples(x)
        x = x.contiguous()
        weight = weight.contiguous() if ctx.affine else x.new_empty(0)
        bias = bias.contiguous() if ctx.affine else x.new_empty(0)

        if ctx.training:
            mean, var = _backend.mean_var(x)

            # Update running stats
            running_mean.mul_((1 - ctx.momentum)).add_(ctx.momentum * mean)
            running_var.mul_((1 - ctx.momentum)).add_(ctx.momentum * var * count / (count - 1))

            # Mark in-place modified tensors
            ctx.mark_dirty(x, running_mean, running_var)
        else:
            mean, var = running_mean.contiguous(), running_var.contiguous()
            ctx.mark_dirty(x)

        # BN forward + activation
        _backend.forward(x, mean, var, weight, bias, ctx.affine, ctx.eps)
        _act_forward(ctx, x)

        # Output
        ctx.var = var
        ctx.save_for_backward(x, var, weight, bias)
        ctx.mark_non_differentiable(running_mean, running_var)
        return x, running_mean, running_var

    @staticmethod
    @once_differentiable
    def backward(ctx, dz, _drunning_mean, _drunning_var):
        z, var, weight, bias = ctx.saved_tensors
        dz = dz.contiguous()

        # Undo activation
        _act_backward(ctx, z, dz)

        if ctx.training:
            edz, eydz = _backend.edz_eydz(z, dz, weight, bias, ctx.affine, ctx.eps)
        else:
            # TODO: implement simplified CUDA backward for inference mode
            edz = dz.new_zeros(dz.size(1))
            eydz = dz.new_zeros(dz.size(1))

        dx = _backend.backward(z, dz, var, weight, bias, edz, eydz, ctx.affine, ctx.eps)
        # dweight = eydz * weight.sign() if ctx.affine else None
        dweight = eydz if ctx.affine else None
        if dweight is not None:
            dweight[weight < 0] *= -1
        dbias = edz if ctx.affine else None

        return dx, dweight, dbias, None, None, None, None, None, None, None


class InPlaceABNSync(autograd.Function):
    @classmethod
    def forward(cls, ctx, x, weight, bias, running_mean, running_var,
                training=True, momentum=0.1, eps=1e-05, activation=ACT_LEAKY_RELU, slope=0.01, equal_batches=True):
        # Save context
        ctx.training = training
        ctx.momentum = momentum
        ctx.eps = eps
        ctx.activation = activation
        ctx.slope = slope
        ctx.affine = weight is not None and bias is not None

        # Prepare inputs
        ctx.world_size = dist.get_world_size() if dist.is_initialized() else 1

        # count = _count_samples(x)
        batch_size = x.new_tensor([x.shape[0]], dtype=torch.long)

        x = x.contiguous()
        weight = weight.contiguous() if ctx.affine else x.new_empty(0)
        bias = bias.contiguous() if ctx.affine else x.new_empty(0)

        if ctx.training:
            mean, var = _backend.mean_var(x)
            if ctx.world_size > 1:
                # get global batch size
                if equal_batches:
                    batch_size *= ctx.world_size
                else:
                    dist.all_reduce(batch_size, dist.ReduceOp.SUM)

                ctx.factor = x.shape[0] / float(batch_size.item())

                mean_all = mean.clone() * ctx.factor
                dist.all_reduce(mean_all, dist.ReduceOp.SUM)

                var_all = (var + (mean - mean_all) ** 2) * ctx.factor
                dist.all_reduce(var_all, dist.ReduceOp.SUM)

                mean = mean_all
                var = var_all

            # Update running stats
            running_mean.mul_((1 - ctx.momentum)).add_(ctx.momentum * mean)
            count = batch_size.item() * x.view(x.shape[0], x.shape[1], -1).shape[-1]
            running_var.mul_((1 - ctx.momentum)).add_(ctx.momentum * var * (float(count) / (count - 1)))

            # Mark in-place modified tensors
            ctx.mark_dirty(x, running_mean, running_var)
        else:
            mean, var = running_mean.contiguous(), running_var.contiguous()
            ctx.mark_dirty(x)

        # BN forward + activation
        _backend.forward(x, mean, var, weight, bias, ctx.affine, ctx.eps)
        _act_forward(ctx, x)

        # Output
        ctx.var = var
        ctx.save_for_backward(x, var, weight, bias)
        ctx.mark_non_differentiable(running_mean, running_var)
        return x, running_mean, running_var

    @staticmethod
    @once_differentiable
    def backward(ctx, dz, _drunning_mean, _drunning_var):
        z, var, weight, bias = ctx.saved_tensors
        dz = dz.contiguous()

        # Undo activation
        _act_backward(ctx, z, dz)

        if ctx.training:
            edz, eydz = _backend.edz_eydz(z, dz, weight, bias, ctx.affine, ctx.eps)
            edz_local = edz.clone()
            eydz_local = eydz.clone()

            if ctx.world_size > 1:
                edz *= ctx.factor
                dist.all_reduce(edz, dist.ReduceOp.SUM)

                eydz *= ctx.factor
                dist.all_reduce(eydz, dist.ReduceOp.SUM)
        else:
            edz_local = edz = dz.new_zeros(dz.size(1))
            eydz_local = eydz = dz.new_zeros(dz.size(1))

        dx = _backend.backward(z, dz, var, weight, bias, edz, eydz, ctx.affine, ctx.eps)
        # dweight = eydz_local * weight.sign() if ctx.affine else None
        dweight = eydz_local if ctx.affine else None
        if dweight is not None:
            dweight[weight < 0] *= -1
        dbias = edz_local if ctx.affine else None

        return dx, dweight, dbias, None, None, None, None, None, None, None


inplace_abn = InPlaceABN.apply
inplace_abn_sync = InPlaceABNSync.apply

__all__ = ["inplace_abn", "inplace_abn_sync", "ACT_RELU", "ACT_LEAKY_RELU", "ACT_ELU", "ACT_NONE"]
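The load(...) call at the top of this file JIT-compiles the C++/CUDA sources on first import, so the first import needs a working compiler and CUDA toolchain and can take a while. A minimal sketch of calling the functional interface directly, mirroring what InPlaceABN.forward in bn.py does (illustrative only; import path, device and sizes are assumptions):

# Hypothetical direct call of the autograd function.
import torch
from modules.functions import inplace_abn, ACT_LEAKY_RELU   # assumed import path

x = torch.randn(2, 64, 32, 32, device="cuda")
weight = torch.ones(64, device="cuda", requires_grad=True)
bias = torch.zeros(64, device="cuda", requires_grad=True)
running_mean = torch.zeros(64, device="cuda")
running_var = torch.ones(64, device="cuda")

y, running_mean, running_var = inplace_abn(x, weight, bias, running_mean, running_var,
                                           True, 0.1, 1e-5, ACT_LEAKY_RELU, 0.01)
# y aliases x: normalization and activation were applied in place.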
21
vton-api/preprocess/humanparsing/modules/misc.py
Normal file
@@ -0,0 +1,21 @@
import torch.nn as nn
import torch
import torch.distributed as dist


class GlobalAvgPool2d(nn.Module):
    def __init__(self):
        """Global average pooling over the input's spatial dimensions"""
        super(GlobalAvgPool2d, self).__init__()

    def forward(self, inputs):
        in_size = inputs.size()
        return inputs.view((in_size[0], in_size[1], -1)).mean(dim=2)


class SingleGPU(nn.Module):
    def __init__(self, module):
        super(SingleGPU, self).__init__()
        self.module = module

    def forward(self, input):
        return self.module(input.cuda(non_blocking=True))
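GlobalAvgPool2d above is the flatten-and-mean formulation of global average pooling; it matches adaptive average pooling to 1x1 followed by a flatten. A quick equivalence check (illustrative; import path assumed):

# GlobalAvgPool2d equivalence check.
import torch
import torch.nn.functional as F
from modules.misc import GlobalAvgPool2d   # assumed import path

x = torch.randn(2, 64, 7, 7)
gap = GlobalAvgPool2d()
ref = F.adaptive_avg_pool2d(x, 1).flatten(1)
assert torch.allclose(gap(x), ref, atol=1e-6)   # both give shape (2, 64)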
182
vton-api/preprocess/humanparsing/modules/residual.py
Normal file
@@ -0,0 +1,182 @@
from collections import OrderedDict

import torch.nn as nn

from .bn import ABN, ACT_LEAKY_RELU, ACT_ELU, ACT_NONE
import torch.nn.functional as functional


class ResidualBlock(nn.Module):
    """Configurable residual block

    Parameters
    ----------
    in_channels : int
        Number of input channels.
    channels : list of int
        Number of channels in the internal feature maps. Can either have two or three elements: if three construct
        a residual block with two `3 x 3` convolutions, otherwise construct a bottleneck block with `1 x 1`, then
        `3 x 3` then `1 x 1` convolutions.
    stride : int
        Stride of the first `3 x 3` convolution
    dilation : int
        Dilation to apply to the `3 x 3` convolutions.
    groups : int
        Number of convolution groups. This is used to create ResNeXt-style blocks and is only compatible with
        bottleneck blocks.
    norm_act : callable
        Function to create normalization / activation Module.
    dropout: callable
        Function to create Dropout Module.
    """

    def __init__(self,
                 in_channels,
                 channels,
                 stride=1,
                 dilation=1,
                 groups=1,
                 norm_act=ABN,
                 dropout=None):
        super(ResidualBlock, self).__init__()

        # Check parameters for inconsistencies
        if len(channels) != 2 and len(channels) != 3:
            raise ValueError("channels must contain either two or three values")
        if len(channels) == 2 and groups != 1:
            raise ValueError("groups > 1 are only valid if len(channels) == 3")

        is_bottleneck = len(channels) == 3
        need_proj_conv = stride != 1 or in_channels != channels[-1]

        if not is_bottleneck:
            bn2 = norm_act(channels[1])
            bn2.activation = ACT_NONE
            layers = [
                ("conv1", nn.Conv2d(in_channels, channels[0], 3, stride=stride, padding=dilation, bias=False,
                                    dilation=dilation)),
                ("bn1", norm_act(channels[0])),
                ("conv2", nn.Conv2d(channels[0], channels[1], 3, stride=1, padding=dilation, bias=False,
                                    dilation=dilation)),
                ("bn2", bn2)
            ]
            if dropout is not None:
                layers = layers[0:2] + [("dropout", dropout())] + layers[2:]
        else:
            bn3 = norm_act(channels[2])
            bn3.activation = ACT_NONE
            layers = [
                ("conv1", nn.Conv2d(in_channels, channels[0], 1, stride=1, padding=0, bias=False)),
                ("bn1", norm_act(channels[0])),
                ("conv2", nn.Conv2d(channels[0], channels[1], 3, stride=stride, padding=dilation, bias=False,
                                    groups=groups, dilation=dilation)),
                ("bn2", norm_act(channels[1])),
                ("conv3", nn.Conv2d(channels[1], channels[2], 1, stride=1, padding=0, bias=False)),
                ("bn3", bn3)
            ]
            if dropout is not None:
                layers = layers[0:4] + [("dropout", dropout())] + layers[4:]
        self.convs = nn.Sequential(OrderedDict(layers))

        if need_proj_conv:
            self.proj_conv = nn.Conv2d(in_channels, channels[-1], 1, stride=stride, padding=0, bias=False)
            self.proj_bn = norm_act(channels[-1])
            self.proj_bn.activation = ACT_NONE

    def forward(self, x):
        if hasattr(self, "proj_conv"):
            residual = self.proj_conv(x)
            residual = self.proj_bn(residual)
        else:
            residual = x
        x = self.convs(x) + residual

        if self.convs.bn1.activation == ACT_LEAKY_RELU:
            return functional.leaky_relu(x, negative_slope=self.convs.bn1.slope, inplace=True)
        elif self.convs.bn1.activation == ACT_ELU:
            return functional.elu(x, inplace=True)
        else:
            return x


class IdentityResidualBlock(nn.Module):
    def __init__(self,
                 in_channels,
                 channels,
                 stride=1,
                 dilation=1,
                 groups=1,
                 norm_act=ABN,
                 dropout=None):
        """Configurable identity-mapping residual block

        Parameters
        ----------
        in_channels : int
            Number of input channels.
        channels : list of int
            Number of channels in the internal feature maps. Can either have two or three elements: if three construct
            a residual block with two `3 x 3` convolutions, otherwise construct a bottleneck block with `1 x 1`, then
            `3 x 3` then `1 x 1` convolutions.
        stride : int
            Stride of the first `3 x 3` convolution
        dilation : int
            Dilation to apply to the `3 x 3` convolutions.
        groups : int
            Number of convolution groups. This is used to create ResNeXt-style blocks and is only compatible with
            bottleneck blocks.
        norm_act : callable
            Function to create normalization / activation Module.
        dropout: callable
            Function to create Dropout Module.
        """
        super(IdentityResidualBlock, self).__init__()

        # Check parameters for inconsistencies
        if len(channels) != 2 and len(channels) != 3:
            raise ValueError("channels must contain either two or three values")
        if len(channels) == 2 and groups != 1:
            raise ValueError("groups > 1 are only valid if len(channels) == 3")

        is_bottleneck = len(channels) == 3
        need_proj_conv = stride != 1 or in_channels != channels[-1]

        self.bn1 = norm_act(in_channels)
        if not is_bottleneck:
            layers = [
                ("conv1", nn.Conv2d(in_channels, channels[0], 3, stride=stride, padding=dilation, bias=False,
                                    dilation=dilation)),
                ("bn2", norm_act(channels[0])),
                ("conv2", nn.Conv2d(channels[0], channels[1], 3, stride=1, padding=dilation, bias=False,
                                    dilation=dilation))
            ]
            if dropout is not None:
                layers = layers[0:2] + [("dropout", dropout())] + layers[2:]
        else:
            layers = [
                ("conv1", nn.Conv2d(in_channels, channels[0], 1, stride=stride, padding=0, bias=False)),
                ("bn2", norm_act(channels[0])),
                ("conv2", nn.Conv2d(channels[0], channels[1], 3, stride=1, padding=dilation, bias=False,
                                    groups=groups, dilation=dilation)),
                ("bn3", norm_act(channels[1])),
                ("conv3", nn.Conv2d(channels[1], channels[2], 1, stride=1, padding=0, bias=False))
            ]
            if dropout is not None:
                layers = layers[0:4] + [("dropout", dropout())] + layers[4:]
        self.convs = nn.Sequential(OrderedDict(layers))

        if need_proj_conv:
            self.proj_conv = nn.Conv2d(in_channels, channels[-1], 1, stride=stride, padding=0, bias=False)

    def forward(self, x):
        if hasattr(self, "proj_conv"):
            bn1 = self.bn1(x)
            shortcut = self.proj_conv(bn1)
        else:
            shortcut = x.clone()
            bn1 = self.bn1(x)

        out = self.convs(bn1)
        out.add_(shortcut)

        return out
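A usage sketch for the pre-activation block above (illustrative; import path and sizes are assumptions):

# Hypothetical IdentityResidualBlock usage.
import torch
from modules import ABN, IdentityResidualBlock   # assumed import path

# Bottleneck form: 1x1 -> 3x3 -> 1x1 with a projection shortcut (stride 2, 64 -> 256 channels).
block = IdentityResidualBlock(in_channels=64, channels=[64, 64, 256], stride=2, norm_act=ABN)
x = torch.randn(1, 64, 56, 56)
y = block(x)      # expected shape: (1, 256, 28, 28)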
15
vton-api/preprocess/humanparsing/modules/src/checks.h
Normal file
@@ -0,0 +1,15 @@
#pragma once

#include <ATen/ATen.h>

// Define AT_CHECK for old version of ATen where the same function was called AT_ASSERT
#ifndef AT_CHECK
  #define AT_CHECK AT_ASSERT
#endif

#define CHECK_CUDA(x) AT_CHECK((x).type().is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CPU(x) AT_CHECK(!(x).type().is_cuda(), #x " must be a CPU tensor")
#define CHECK_CONTIGUOUS(x) AT_CHECK((x).is_contiguous(), #x " must be contiguous")

#define CHECK_CUDA_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)
#define CHECK_CPU_INPUT(x) CHECK_CPU(x); CHECK_CONTIGUOUS(x)
95
vton-api/preprocess/humanparsing/modules/src/inplace_abn.cpp
Normal file
@@ -0,0 +1,95 @@
#include <torch/extension.h>

#include <vector>

#include "inplace_abn.h"

std::vector<at::Tensor> mean_var(at::Tensor x) {
  if (x.is_cuda()) {
    if (x.type().scalarType() == at::ScalarType::Half) {
      return mean_var_cuda_h(x);
    } else {
      return mean_var_cuda(x);
    }
  } else {
    return mean_var_cpu(x);
  }
}

at::Tensor forward(at::Tensor x, at::Tensor mean, at::Tensor var, at::Tensor weight, at::Tensor bias,
                   bool affine, float eps) {
  if (x.is_cuda()) {
    if (x.type().scalarType() == at::ScalarType::Half) {
      return forward_cuda_h(x, mean, var, weight, bias, affine, eps);
    } else {
      return forward_cuda(x, mean, var, weight, bias, affine, eps);
    }
  } else {
    return forward_cpu(x, mean, var, weight, bias, affine, eps);
  }
}

std::vector<at::Tensor> edz_eydz(at::Tensor z, at::Tensor dz, at::Tensor weight, at::Tensor bias,
                                 bool affine, float eps) {
  if (z.is_cuda()) {
    if (z.type().scalarType() == at::ScalarType::Half) {
      return edz_eydz_cuda_h(z, dz, weight, bias, affine, eps);
    } else {
      return edz_eydz_cuda(z, dz, weight, bias, affine, eps);
    }
  } else {
    return edz_eydz_cpu(z, dz, weight, bias, affine, eps);
  }
}

at::Tensor backward(at::Tensor z, at::Tensor dz, at::Tensor var, at::Tensor weight, at::Tensor bias,
                    at::Tensor edz, at::Tensor eydz, bool affine, float eps) {
  if (z.is_cuda()) {
    if (z.type().scalarType() == at::ScalarType::Half) {
      return backward_cuda_h(z, dz, var, weight, bias, edz, eydz, affine, eps);
    } else {
      return backward_cuda(z, dz, var, weight, bias, edz, eydz, affine, eps);
    }
  } else {
    return backward_cpu(z, dz, var, weight, bias, edz, eydz, affine, eps);
  }
}

void leaky_relu_forward(at::Tensor z, float slope) {
  at::leaky_relu_(z, slope);
}

void leaky_relu_backward(at::Tensor z, at::Tensor dz, float slope) {
  if (z.is_cuda()) {
    if (z.type().scalarType() == at::ScalarType::Half) {
      return leaky_relu_backward_cuda_h(z, dz, slope);
    } else {
      return leaky_relu_backward_cuda(z, dz, slope);
    }
  } else {
    return leaky_relu_backward_cpu(z, dz, slope);
  }
}

void elu_forward(at::Tensor z) {
  at::elu_(z);
}

void elu_backward(at::Tensor z, at::Tensor dz) {
  if (z.is_cuda()) {
    return elu_backward_cuda(z, dz);
  } else {
    return elu_backward_cpu(z, dz);
  }
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("mean_var", &mean_var, "Mean and variance computation");
  m.def("forward", &forward, "In-place forward computation");
  m.def("edz_eydz", &edz_eydz, "First part of backward computation");
  m.def("backward", &backward, "Second part of backward computation");
  m.def("leaky_relu_forward", &leaky_relu_forward, "Leaky relu forward computation");
  m.def("leaky_relu_backward", &leaky_relu_backward, "Leaky relu backward computation and inversion");
  m.def("elu_forward", &elu_forward, "Elu forward computation");
  m.def("elu_backward", &elu_backward, "Elu backward computation and inversion");
}
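This translation unit is the dispatch layer: every binding picks the CPU, CUDA or CUDA-half implementation based on the tensor's device and scalar type, and the pybind11 module is what functions.py loads as _backend. A sketch of touching those bindings from Python on the CPU path (illustrative; the _backend attribute name follows functions.py and the argument order follows the m.def signatures above):

# Hypothetical direct use of the compiled backend (CPU path).
import torch
from modules.functions import _backend   # built by the load(...) call in functions.py

x = torch.randn(2, 8, 4, 4).contiguous()
weight = torch.rand(8) + 0.1
bias = torch.zeros(8)

mean, var = _backend.mean_var(x)                            # per-channel statistics
_backend.forward(x, mean, var, weight, bias, True, 1e-5)    # normalizes x in place; activation is applied separately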
88
vton-api/preprocess/humanparsing/modules/src/inplace_abn.h
Normal file
@@ -0,0 +1,88 @@
#pragma once

#include <ATen/ATen.h>

#include <vector>

std::vector<at::Tensor> mean_var_cpu(at::Tensor x);
std::vector<at::Tensor> mean_var_cuda(at::Tensor x);
std::vector<at::Tensor> mean_var_cuda_h(at::Tensor x);

at::Tensor forward_cpu(at::Tensor x, at::Tensor mean, at::Tensor var, at::Tensor weight, at::Tensor bias,
                       bool affine, float eps);
at::Tensor forward_cuda(at::Tensor x, at::Tensor mean, at::Tensor var, at::Tensor weight, at::Tensor bias,
                        bool affine, float eps);
at::Tensor forward_cuda_h(at::Tensor x, at::Tensor mean, at::Tensor var, at::Tensor weight, at::Tensor bias,
                          bool affine, float eps);

std::vector<at::Tensor> edz_eydz_cpu(at::Tensor z, at::Tensor dz, at::Tensor weight, at::Tensor bias,
                                     bool affine, float eps);
std::vector<at::Tensor> edz_eydz_cuda(at::Tensor z, at::Tensor dz, at::Tensor weight, at::Tensor bias,
                                      bool affine, float eps);
std::vector<at::Tensor> edz_eydz_cuda_h(at::Tensor z, at::Tensor dz, at::Tensor weight, at::Tensor bias,
                                        bool affine, float eps);

at::Tensor backward_cpu(at::Tensor z, at::Tensor dz, at::Tensor var, at::Tensor weight, at::Tensor bias,
                        at::Tensor edz, at::Tensor eydz, bool affine, float eps);
at::Tensor backward_cuda(at::Tensor z, at::Tensor dz, at::Tensor var, at::Tensor weight, at::Tensor bias,
                         at::Tensor edz, at::Tensor eydz, bool affine, float eps);
at::Tensor backward_cuda_h(at::Tensor z, at::Tensor dz, at::Tensor var, at::Tensor weight, at::Tensor bias,
                           at::Tensor edz, at::Tensor eydz, bool affine, float eps);

void leaky_relu_backward_cpu(at::Tensor z, at::Tensor dz, float slope);
void leaky_relu_backward_cuda(at::Tensor z, at::Tensor dz, float slope);
void leaky_relu_backward_cuda_h(at::Tensor z, at::Tensor dz, float slope);

void elu_backward_cpu(at::Tensor z, at::Tensor dz);
void elu_backward_cuda(at::Tensor z, at::Tensor dz);

static void get_dims(at::Tensor x, int64_t& num, int64_t& chn, int64_t& sp) {
  num = x.size(0);
  chn = x.size(1);
  sp = 1;
  for (int64_t i = 2; i < x.ndimension(); ++i)
    sp *= x.size(i);
}

/*
 * Specialized CUDA reduction functions for BN
 */
#ifdef __CUDACC__

#include "utils/cuda.cuh"

template <typename T, typename Op>
__device__ T reduce(Op op, int plane, int N, int S) {
  T sum = (T)0;
  for (int batch = 0; batch < N; ++batch) {
    for (int x = threadIdx.x; x < S; x += blockDim.x) {
      sum += op(batch, plane, x);
    }
  }

  // sum over NumThreads within a warp
  sum = warpSum(sum);

  // 'transpose', and reduce within warp again
  __shared__ T shared[32];
  __syncthreads();
  if (threadIdx.x % WARP_SIZE == 0) {
    shared[threadIdx.x / WARP_SIZE] = sum;
  }
  if (threadIdx.x >= blockDim.x / WARP_SIZE && threadIdx.x < WARP_SIZE) {
    // zero out the other entries in shared
    shared[threadIdx.x] = (T)0;
  }
  __syncthreads();
  if (threadIdx.x / WARP_SIZE == 0) {
    sum = warpSum(shared[threadIdx.x]);
    if (threadIdx.x == 0) {
      shared[0] = sum;
    }
  }
  __syncthreads();

  // Everyone picks it up, should be broadcast into the whole gradInput
  return shared[0];
}
#endif
119
vton-api/preprocess/humanparsing/modules/src/inplace_abn_cpu.cpp
Normal file
@@ -0,0 +1,119 @@
#include <ATen/ATen.h>

#include <vector>

#include "utils/checks.h"
#include "inplace_abn.h"

at::Tensor reduce_sum(at::Tensor x) {
  if (x.ndimension() == 2) {
    return x.sum(0);
  } else {
    auto x_view = x.view({x.size(0), x.size(1), -1});
    return x_view.sum(-1).sum(0);
  }
}

at::Tensor broadcast_to(at::Tensor v, at::Tensor x) {
  if (x.ndimension() == 2) {
    return v;
  } else {
    std::vector<int64_t> broadcast_size = {1, -1};
    for (int64_t i = 2; i < x.ndimension(); ++i)
      broadcast_size.push_back(1);

    return v.view(broadcast_size);
  }
}

int64_t count(at::Tensor x) {
  int64_t count = x.size(0);
  for (int64_t i = 2; i < x.ndimension(); ++i)
    count *= x.size(i);

  return count;
}

at::Tensor invert_affine(at::Tensor z, at::Tensor weight, at::Tensor bias, bool affine, float eps) {
  if (affine) {
    return (z - broadcast_to(bias, z)) / broadcast_to(at::abs(weight) + eps, z);
  } else {
    return z;
  }
}

std::vector<at::Tensor> mean_var_cpu(at::Tensor x) {
  auto num = count(x);
  auto mean = reduce_sum(x) / num;
  auto diff = x - broadcast_to(mean, x);
  auto var = reduce_sum(diff.pow(2)) / num;

  return {mean, var};
}

at::Tensor forward_cpu(at::Tensor x, at::Tensor mean, at::Tensor var, at::Tensor weight, at::Tensor bias,
                       bool affine, float eps) {
  auto gamma = affine ? at::abs(weight) + eps : at::ones_like(var);
  auto mul = at::rsqrt(var + eps) * gamma;

  x.sub_(broadcast_to(mean, x));
  x.mul_(broadcast_to(mul, x));
  if (affine) x.add_(broadcast_to(bias, x));

  return x;
}

std::vector<at::Tensor> edz_eydz_cpu(at::Tensor z, at::Tensor dz, at::Tensor weight, at::Tensor bias,
                                     bool affine, float eps) {
  auto edz = reduce_sum(dz);
  auto y = invert_affine(z, weight, bias, affine, eps);
  auto eydz = reduce_sum(y * dz);

  return {edz, eydz};
}

at::Tensor backward_cpu(at::Tensor z, at::Tensor dz, at::Tensor var, at::Tensor weight, at::Tensor bias,
                        at::Tensor edz, at::Tensor eydz, bool affine, float eps) {
  auto y = invert_affine(z, weight, bias, affine, eps);
  auto mul = affine ? at::rsqrt(var + eps) * (at::abs(weight) + eps) : at::rsqrt(var + eps);

  auto num = count(z);
  auto dx = (dz - broadcast_to(edz / num, dz) - y * broadcast_to(eydz / num, dz)) * broadcast_to(mul, dz);
  return dx;
}

void leaky_relu_backward_cpu(at::Tensor z, at::Tensor dz, float slope) {
  CHECK_CPU_INPUT(z);
  CHECK_CPU_INPUT(dz);

  AT_DISPATCH_FLOATING_TYPES(z.type(), "leaky_relu_backward_cpu", ([&] {
    int64_t count = z.numel();
    auto *_z = z.data<scalar_t>();
    auto *_dz = dz.data<scalar_t>();

    for (int64_t i = 0; i < count; ++i) {
      if (_z[i] < 0) {
        _z[i] *= 1 / slope;
        _dz[i] *= slope;
      }
    }
  }));
}

void elu_backward_cpu(at::Tensor z, at::Tensor dz) {
  CHECK_CPU_INPUT(z);
  CHECK_CPU_INPUT(dz);

  AT_DISPATCH_FLOATING_TYPES(z.type(), "elu_backward_cpu", ([&] {
    int64_t count = z.numel();
    auto *_z = z.data<scalar_t>();
    auto *_dz = dz.data<scalar_t>();

    for (int64_t i = 0; i < count; ++i) {
      if (_z[i] < 0) {
        _z[i] = log1p(_z[i]);
        _dz[i] *= (_z[i] + 1.f);
      }
    }
  }));
}
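The CPU path uses biased (population) per-channel statistics and scales by |weight| + eps instead of weight, which is what makes the in-place transform invertible in invert_affine. A pure-PyTorch cross-check of mean_var_cpu and forward_cpu (illustrative only):

# Reference computation matching mean_var_cpu / forward_cpu.
import torch

x = torch.randn(2, 8, 4, 4)
mean = x.mean(dim=(0, 2, 3))
var = x.var(dim=(0, 2, 3), unbiased=False)     # biased variance, matching reduce_sum(diff^2) / num
weight, bias, eps = torch.rand(8) + 0.1, torch.zeros(8), 1e-5

shape = (1, -1, 1, 1)
y = (x - mean.view(shape)) * torch.rsqrt(var.view(shape) + eps) * (weight.abs() + eps).view(shape) \
    + bias.view(shape)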
333
vton-api/preprocess/humanparsing/modules/src/inplace_abn_cuda.cu
Normal file
@@ -0,0 +1,333 @@
#include <ATen/ATen.h>

#include <thrust/device_ptr.h>
#include <thrust/transform.h>

#include <vector>

#include "utils/checks.h"
#include "utils/cuda.cuh"
#include "inplace_abn.h"

#include <ATen/cuda/CUDAContext.h>

// Operations for reduce
template<typename T>
struct SumOp {
  __device__ SumOp(const T *t, int c, int s)
      : tensor(t), chn(c), sp(s) {}
  __device__ __forceinline__ T operator()(int batch, int plane, int n) {
    return tensor[(batch * chn + plane) * sp + n];
  }
  const T *tensor;
  const int chn;
  const int sp;
};

template<typename T>
struct VarOp {
  __device__ VarOp(T m, const T *t, int c, int s)
      : mean(m), tensor(t), chn(c), sp(s) {}
  __device__ __forceinline__ T operator()(int batch, int plane, int n) {
    T val = tensor[(batch * chn + plane) * sp + n];
    return (val - mean) * (val - mean);
  }
  const T mean;
  const T *tensor;
  const int chn;
  const int sp;
};

template<typename T>
struct GradOp {
  __device__ GradOp(T _weight, T _bias, const T *_z, const T *_dz, int c, int s)
      : weight(_weight), bias(_bias), z(_z), dz(_dz), chn(c), sp(s) {}
  __device__ __forceinline__ Pair<T> operator()(int batch, int plane, int n) {
    T _y = (z[(batch * chn + plane) * sp + n] - bias) / weight;
    T _dz = dz[(batch * chn + plane) * sp + n];
    return Pair<T>(_dz, _y * _dz);
  }
  const T weight;
  const T bias;
  const T *z;
  const T *dz;
  const int chn;
  const int sp;
};

/***********
 * mean_var
 ***********/

template<typename T>
__global__ void mean_var_kernel(const T *x, T *mean, T *var, int num, int chn, int sp) {
  int plane = blockIdx.x;
  T norm = T(1) / T(num * sp);

  T _mean = reduce<T, SumOp<T>>(SumOp<T>(x, chn, sp), plane, num, sp) * norm;
  __syncthreads();
  T _var = reduce<T, VarOp<T>>(VarOp<T>(_mean, x, chn, sp), plane, num, sp) * norm;

  if (threadIdx.x == 0) {
    mean[plane] = _mean;
    var[plane] = _var;
  }
}

std::vector<at::Tensor> mean_var_cuda(at::Tensor x) {
  CHECK_CUDA_INPUT(x);

  // Extract dimensions
  int64_t num, chn, sp;
  get_dims(x, num, chn, sp);

  // Prepare output tensors
  auto mean = at::empty({chn}, x.options());
  auto var = at::empty({chn}, x.options());

  // Run kernel
  dim3 blocks(chn);
  dim3 threads(getNumThreads(sp));
  auto stream = at::cuda::getCurrentCUDAStream();
  AT_DISPATCH_FLOATING_TYPES(x.type(), "mean_var_cuda", ([&] {
    mean_var_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
        x.data<scalar_t>(),
        mean.data<scalar_t>(),
        var.data<scalar_t>(),
        num, chn, sp);
  }));

  return {mean, var};
}

/**********
 * forward
 **********/

template<typename T>
__global__ void forward_kernel(T *x, const T *mean, const T *var, const T *weight, const T *bias,
                               bool affine, float eps, int num, int chn, int sp) {
  int plane = blockIdx.x;

  T _mean = mean[plane];
  T _var = var[plane];
  T _weight = affine ? abs(weight[plane]) + eps : T(1);
  T _bias = affine ? bias[plane] : T(0);

  T mul = rsqrt(_var + eps) * _weight;

  for (int batch = 0; batch < num; ++batch) {
    for (int n = threadIdx.x; n < sp; n += blockDim.x) {
      T _x = x[(batch * chn + plane) * sp + n];
      T _y = (_x - _mean) * mul + _bias;

      x[(batch * chn + plane) * sp + n] = _y;
    }
  }
}

at::Tensor forward_cuda(at::Tensor x, at::Tensor mean, at::Tensor var, at::Tensor weight, at::Tensor bias,
                        bool affine, float eps) {
  CHECK_CUDA_INPUT(x);
  CHECK_CUDA_INPUT(mean);
  CHECK_CUDA_INPUT(var);
  CHECK_CUDA_INPUT(weight);
  CHECK_CUDA_INPUT(bias);

  // Extract dimensions
  int64_t num, chn, sp;
  get_dims(x, num, chn, sp);

  // Run kernel
  dim3 blocks(chn);
  dim3 threads(getNumThreads(sp));
  auto stream = at::cuda::getCurrentCUDAStream();
  AT_DISPATCH_FLOATING_TYPES(x.type(), "forward_cuda", ([&] {
    forward_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
        x.data<scalar_t>(),
        mean.data<scalar_t>(),
        var.data<scalar_t>(),
        weight.data<scalar_t>(),
        bias.data<scalar_t>(),
        affine, eps, num, chn, sp);
  }));

  return x;
}

/***********
 * edz_eydz
 ***********/

template<typename T>
__global__ void edz_eydz_kernel(const T *z, const T *dz, const T *weight, const T *bias,
                                T *edz, T *eydz, bool affine, float eps, int num, int chn, int sp) {
  int plane = blockIdx.x;

  T _weight = affine ? abs(weight[plane]) + eps : 1.f;
  T _bias = affine ? bias[plane] : 0.f;

  Pair<T> res = reduce<Pair<T>, GradOp<T>>(GradOp<T>(_weight, _bias, z, dz, chn, sp), plane, num, sp);
  __syncthreads();

  if (threadIdx.x == 0) {
    edz[plane] = res.v1;
    eydz[plane] = res.v2;
  }
}

std::vector<at::Tensor> edz_eydz_cuda(at::Tensor z, at::Tensor dz, at::Tensor weight, at::Tensor bias,
                                      bool affine, float eps) {
  CHECK_CUDA_INPUT(z);
  CHECK_CUDA_INPUT(dz);
  CHECK_CUDA_INPUT(weight);
  CHECK_CUDA_INPUT(bias);

  // Extract dimensions
  int64_t num, chn, sp;
  get_dims(z, num, chn, sp);

  auto edz = at::empty({chn}, z.options());
  auto eydz = at::empty({chn}, z.options());

  // Run kernel
  dim3 blocks(chn);
  dim3 threads(getNumThreads(sp));
  auto stream = at::cuda::getCurrentCUDAStream();
  AT_DISPATCH_FLOATING_TYPES(z.type(), "edz_eydz_cuda", ([&] {
    edz_eydz_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
        z.data<scalar_t>(),
        dz.data<scalar_t>(),
        weight.data<scalar_t>(),
        bias.data<scalar_t>(),
        edz.data<scalar_t>(),
        eydz.data<scalar_t>(),
        affine, eps, num, chn, sp);
  }));

  return {edz, eydz};
}

/***********
 * backward
 ***********/

template<typename T>
__global__ void backward_kernel(const T *z, const T *dz, const T *var, const T *weight, const T *bias, const T *edz,
                                const T *eydz, T *dx, bool affine, float eps, int num, int chn, int sp) {
  int plane = blockIdx.x;

  T _weight = affine ? abs(weight[plane]) + eps : 1.f;
  T _bias = affine ? bias[plane] : 0.f;
  T _var = var[plane];
  T _edz = edz[plane];
  T _eydz = eydz[plane];

  T _mul = _weight * rsqrt(_var + eps);
  T count = T(num * sp);

  for (int batch = 0; batch < num; ++batch) {
    for (int n = threadIdx.x; n < sp; n += blockDim.x) {
      T _dz = dz[(batch * chn + plane) * sp + n];
      T _y = (z[(batch * chn + plane) * sp + n] - _bias) / _weight;

      dx[(batch * chn + plane) * sp + n] = (_dz - _edz / count - _y * _eydz / count) * _mul;
    }
  }
}

at::Tensor backward_cuda(at::Tensor z, at::Tensor dz, at::Tensor var, at::Tensor weight, at::Tensor bias,
                         at::Tensor edz, at::Tensor eydz, bool affine, float eps) {
  CHECK_CUDA_INPUT(z);
  CHECK_CUDA_INPUT(dz);
  CHECK_CUDA_INPUT(var);
  CHECK_CUDA_INPUT(weight);
  CHECK_CUDA_INPUT(bias);
  CHECK_CUDA_INPUT(edz);
  CHECK_CUDA_INPUT(eydz);

  // Extract dimensions
  int64_t num, chn, sp;
  get_dims(z, num, chn, sp);

  auto dx = at::zeros_like(z);

  // Run kernel
  dim3 blocks(chn);
  dim3 threads(getNumThreads(sp));
  auto stream = at::cuda::getCurrentCUDAStream();
  AT_DISPATCH_FLOATING_TYPES(z.type(), "backward_cuda", ([&] {
    backward_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
        z.data<scalar_t>(),
        dz.data<scalar_t>(),
        var.data<scalar_t>(),
        weight.data<scalar_t>(),
        bias.data<scalar_t>(),
        edz.data<scalar_t>(),
        eydz.data<scalar_t>(),
        dx.data<scalar_t>(),
        affine, eps, num, chn, sp);
  }));

  return dx;
}

/**************
 * activations
 **************/

template<typename T>
inline void leaky_relu_backward_impl(T *z, T *dz, float slope, int64_t count) {
  // Create thrust pointers
  thrust::device_ptr<T> th_z = thrust::device_pointer_cast(z);
  thrust::device_ptr<T> th_dz = thrust::device_pointer_cast(dz);

  auto stream = at::cuda::getCurrentCUDAStream();
  thrust::transform_if(thrust::cuda::par.on(stream),
                       th_dz, th_dz + count, th_z, th_dz,
                       [slope] __device__ (const T& dz) { return dz * slope; },
                       [] __device__ (const T& z) { return z < 0; });
  thrust::transform_if(thrust::cuda::par.on(stream),
                       th_z, th_z + count, th_z,
                       [slope] __device__ (const T& z) { return z / slope; },
                       [] __device__ (const T& z) { return z < 0; });
}

void leaky_relu_backward_cuda(at::Tensor z, at::Tensor dz, float slope) {
  CHECK_CUDA_INPUT(z);
  CHECK_CUDA_INPUT(dz);

  int64_t count = z.numel();

  AT_DISPATCH_FLOATING_TYPES(z.type(), "leaky_relu_backward_cuda", ([&] {
    leaky_relu_backward_impl<scalar_t>(z.data<scalar_t>(), dz.data<scalar_t>(), slope, count);
  }));
}

template<typename T>
inline void elu_backward_impl(T *z, T *dz, int64_t count) {
  // Create thrust pointers
  thrust::device_ptr<T> th_z = thrust::device_pointer_cast(z);
  thrust::device_ptr<T> th_dz = thrust::device_pointer_cast(dz);

  auto stream = at::cuda::getCurrentCUDAStream();
  thrust::transform_if(thrust::cuda::par.on(stream),
                       th_dz, th_dz + count, th_z, th_z, th_dz,
                       [] __device__ (const T& dz, const T& z) { return dz * (z + 1.); },
                       [] __device__ (const T& z) { return z < 0; });
  thrust::transform_if(thrust::cuda::par.on(stream),
                       th_z, th_z + count, th_z,
                       [] __device__ (const T& z) { return log1p(z); },
                       [] __device__ (const T& z) { return z < 0; });
}

void elu_backward_cuda(at::Tensor z, at::Tensor dz) {
  CHECK_CUDA_INPUT(z);
  CHECK_CUDA_INPUT(dz);

  int64_t count = z.numel();

  AT_DISPATCH_FLOATING_TYPES(z.type(), "elu_backward_cuda", ([&] {
    elu_backward_impl<scalar_t>(z.data<scalar_t>(), dz.data<scalar_t>(), count);
  }));
}
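These kernels launch one block per channel and reuse the warp reduction from inplace_abn.h. A hedged end-to-end sanity check, comparing the module wrapper against the standard BatchNorm2d + LeakyReLU pair in training mode (illustrative; requires a GPU, the JIT-built extension, and the assumed import path; small differences are expected because the forward scales by |weight| + eps):

# Hypothetical sanity check of the CUDA path.
import torch
from modules import InPlaceABN   # assumed import path

x = torch.randn(4, 32, 16, 16, device="cuda")
iabn = InPlaceABN(32, activation="leaky_relu", slope=0.01).cuda().train()
ref = torch.nn.Sequential(torch.nn.BatchNorm2d(32, eps=1e-5),
                          torch.nn.LeakyReLU(0.01)).cuda().train()

out = iabn(x.clone())
expected = ref(x.clone())
print((out - expected).abs().max())   # should be small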
@@ -0,0 +1,275 @@
|
||||
#include <ATen/ATen.h>
|
||||
|
||||
#include <cuda_fp16.h>
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "utils/checks.h"
|
||||
#include "utils/cuda.cuh"
|
||||
#include "inplace_abn.h"
|
||||
|
||||
#include <ATen/cuda/CUDAContext.h>
|
||||
|
||||
// Operations for reduce
|
||||
struct SumOpH {
|
||||
__device__ SumOpH(const half *t, int c, int s)
|
||||
: tensor(t), chn(c), sp(s) {}
|
||||
__device__ __forceinline__ float operator()(int batch, int plane, int n) {
|
||||
return __half2float(tensor[(batch * chn + plane) * sp + n]);
|
||||
}
|
||||
const half *tensor;
|
||||
const int chn;
|
||||
const int sp;
|
||||
};
|
||||
|
||||
struct VarOpH {
|
||||
__device__ VarOpH(float m, const half *t, int c, int s)
|
||||
: mean(m), tensor(t), chn(c), sp(s) {}
|
||||
__device__ __forceinline__ float operator()(int batch, int plane, int n) {
|
||||
const auto t = __half2float(tensor[(batch * chn + plane) * sp + n]);
|
||||
return (t - mean) * (t - mean);
|
||||
}
|
||||
const float mean;
|
||||
const half *tensor;
|
||||
const int chn;
|
||||
const int sp;
|
||||
};
|
||||
|
||||
struct GradOpH {
|
||||
__device__ GradOpH(float _weight, float _bias, const half *_z, const half *_dz, int c, int s)
|
||||
: weight(_weight), bias(_bias), z(_z), dz(_dz), chn(c), sp(s) {}
|
||||
__device__ __forceinline__ Pair<float> operator()(int batch, int plane, int n) {
|
||||
float _y = (__half2float(z[(batch * chn + plane) * sp + n]) - bias) / weight;
|
||||
float _dz = __half2float(dz[(batch * chn + plane) * sp + n]);
|
||||
return Pair<float>(_dz, _y * _dz);
|
||||
}
|
||||
const float weight;
|
||||
const float bias;
|
||||
const half *z;
|
||||
const half *dz;
|
||||
const int chn;
|
||||
const int sp;
|
||||
};
|
||||
|
||||
/***********
|
||||
* mean_var
|
||||
***********/
|
||||
|
||||
__global__ void mean_var_kernel_h(const half *x, float *mean, float *var, int num, int chn, int sp) {
|
||||
int plane = blockIdx.x;
|
||||
float norm = 1.f / static_cast<float>(num * sp);
|
||||
|
||||
float _mean = reduce<float, SumOpH>(SumOpH(x, chn, sp), plane, num, sp) * norm;
|
||||
__syncthreads();
|
||||
float _var = reduce<float, VarOpH>(VarOpH(_mean, x, chn, sp), plane, num, sp) * norm;
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
mean[plane] = _mean;
|
||||
var[plane] = _var;
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<at::Tensor> mean_var_cuda_h(at::Tensor x) {
|
||||
CHECK_CUDA_INPUT(x);
|
||||
|
||||
// Extract dimensions
|
||||
int64_t num, chn, sp;
|
||||
get_dims(x, num, chn, sp);
|
||||
|
||||
// Prepare output tensors
|
||||
auto mean = at::empty({chn},x.options().dtype(at::kFloat));
|
||||
auto var = at::empty({chn},x.options().dtype(at::kFloat));
|
||||
|
||||
// Run kernel
|
||||
dim3 blocks(chn);
|
||||
dim3 threads(getNumThreads(sp));
|
||||
auto stream = at::cuda::getCurrentCUDAStream();
|
||||
mean_var_kernel_h<<<blocks, threads, 0, stream>>>(
|
||||
reinterpret_cast<half*>(x.data<at::Half>()),
|
||||
mean.data<float>(),
|
||||
var.data<float>(),
|
||||
num, chn, sp);
|
||||
|
||||
return {mean, var};
|
||||
}
|
||||
|
||||
/**********
|
||||
* forward
|
||||
**********/
|
||||
|
||||
__global__ void forward_kernel_h(half *x, const float *mean, const float *var, const float *weight, const float *bias,
|
||||
bool affine, float eps, int num, int chn, int sp) {
|
||||
int plane = blockIdx.x;
|
||||
|
||||
const float _mean = mean[plane];
|
||||
const float _var = var[plane];
|
||||
const float _weight = affine ? abs(weight[plane]) + eps : 1.f;
|
||||
const float _bias = affine ? bias[plane] : 0.f;
|
||||
|
||||
const float mul = rsqrt(_var + eps) * _weight;
|
||||
|
||||
for (int batch = 0; batch < num; ++batch) {
|
||||
for (int n = threadIdx.x; n < sp; n += blockDim.x) {
|
||||
half *x_ptr = x + (batch * chn + plane) * sp + n;
|
||||
float _x = __half2float(*x_ptr);
|
||||
float _y = (_x - _mean) * mul + _bias;
|
||||
|
||||
*x_ptr = __float2half(_y);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
at::Tensor forward_cuda_h(at::Tensor x, at::Tensor mean, at::Tensor var, at::Tensor weight, at::Tensor bias,
|
||||
bool affine, float eps) {
|
||||
CHECK_CUDA_INPUT(x);
|
||||
CHECK_CUDA_INPUT(mean);
|
||||
CHECK_CUDA_INPUT(var);
|
||||
CHECK_CUDA_INPUT(weight);
|
||||
CHECK_CUDA_INPUT(bias);
|
||||
|
||||
// Extract dimensions
|
||||
int64_t num, chn, sp;
|
||||
get_dims(x, num, chn, sp);
|
||||
|
||||
// Run kernel
|
||||
dim3 blocks(chn);
|
||||
dim3 threads(getNumThreads(sp));
|
||||
auto stream = at::cuda::getCurrentCUDAStream();
|
||||
forward_kernel_h<<<blocks, threads, 0, stream>>>(
|
||||
reinterpret_cast<half*>(x.data<at::Half>()),
|
||||
mean.data<float>(),
|
||||
var.data<float>(),
|
||||
weight.data<float>(),
|
||||
bias.data<float>(),
|
||||
affine, eps, num, chn, sp);
|
||||
|
||||
return x;
|
||||
}
|
||||
|
||||
__global__ void edz_eydz_kernel_h(const half *z, const half *dz, const float *weight, const float *bias,
|
||||
float *edz, float *eydz, bool affine, float eps, int num, int chn, int sp) {
|
||||
int plane = blockIdx.x;
|
||||
|
||||
float _weight = affine ? abs(weight[plane]) + eps : 1.f;
|
||||
float _bias = affine ? bias[plane] : 0.f;
|
||||
|
||||
Pair<float> res = reduce<Pair<float>, GradOpH>(GradOpH(_weight, _bias, z, dz, chn, sp), plane, num, sp);
|
||||
__syncthreads();
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
edz[plane] = res.v1;
|
||||
eydz[plane] = res.v2;
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<at::Tensor> edz_eydz_cuda_h(at::Tensor z, at::Tensor dz, at::Tensor weight, at::Tensor bias,
|
||||
bool affine, float eps) {
|
||||
CHECK_CUDA_INPUT(z);
|
||||
CHECK_CUDA_INPUT(dz);
|
||||
CHECK_CUDA_INPUT(weight);
|
||||
CHECK_CUDA_INPUT(bias);
|
||||
|
||||
// Extract dimensions
|
||||
int64_t num, chn, sp;
|
||||
get_dims(z, num, chn, sp);
|
||||
|
||||
auto edz = at::empty({chn},z.options().dtype(at::kFloat));
|
||||
auto eydz = at::empty({chn},z.options().dtype(at::kFloat));
|
||||
|
||||
// Run kernel
|
||||
dim3 blocks(chn);
|
||||
dim3 threads(getNumThreads(sp));
|
||||
auto stream = at::cuda::getCurrentCUDAStream();
|
||||
edz_eydz_kernel_h<<<blocks, threads, 0, stream>>>(
|
||||
reinterpret_cast<half*>(z.data<at::Half>()),
|
||||
reinterpret_cast<half*>(dz.data<at::Half>()),
|
||||
weight.data<float>(),
|
||||
bias.data<float>(),
|
||||
edz.data<float>(),
|
||||
eydz.data<float>(),
|
||||
affine, eps, num, chn, sp);
|
||||
|
||||
return {edz, eydz};
|
||||
}
|
||||
|
||||
__global__ void backward_kernel_h(const half *z, const half *dz, const float *var, const float *weight, const float *bias, const float *edz,
                                  const float *eydz, half *dx, bool affine, float eps, int num, int chn, int sp) {
  int plane = blockIdx.x;

  float _weight = affine ? abs(weight[plane]) + eps : 1.f;
  float _bias = affine ? bias[plane] : 0.f;
  float _var = var[plane];
  float _edz = edz[plane];
  float _eydz = eydz[plane];

  float _mul = _weight * rsqrt(_var + eps);
  float count = float(num * sp);

  for (int batch = 0; batch < num; ++batch) {
    for (int n = threadIdx.x; n < sp; n += blockDim.x) {
      float _dz = __half2float(dz[(batch * chn + plane) * sp + n]);
      // Recover the normalized value y_hat from the affine-transformed output z, then
      // apply the standard batch-norm input gradient:
      // dx = weight / sqrt(var + eps) * (dz - mean(dz) - y_hat * mean(dz * y_hat))
      float _y = (__half2float(z[(batch * chn + plane) * sp + n]) - _bias) / _weight;

      dx[(batch * chn + plane) * sp + n] = __float2half((_dz - _edz / count - _y * _eydz / count) * _mul);
    }
  }
}

at::Tensor backward_cuda_h(at::Tensor z, at::Tensor dz, at::Tensor var, at::Tensor weight, at::Tensor bias,
                           at::Tensor edz, at::Tensor eydz, bool affine, float eps) {
  CHECK_CUDA_INPUT(z);
  CHECK_CUDA_INPUT(dz);
  CHECK_CUDA_INPUT(var);
  CHECK_CUDA_INPUT(weight);
  CHECK_CUDA_INPUT(bias);
  CHECK_CUDA_INPUT(edz);
  CHECK_CUDA_INPUT(eydz);

  // Extract dimensions
  int64_t num, chn, sp;
  get_dims(z, num, chn, sp);

  auto dx = at::zeros_like(z);

  // Run kernel
  dim3 blocks(chn);
  dim3 threads(getNumThreads(sp));
  auto stream = at::cuda::getCurrentCUDAStream();
  backward_kernel_h<<<blocks, threads, 0, stream>>>(
      reinterpret_cast<half*>(z.data<at::Half>()),
      reinterpret_cast<half*>(dz.data<at::Half>()),
      var.data<float>(),
      weight.data<float>(),
      bias.data<float>(),
      edz.data<float>(),
      eydz.data<float>(),
      reinterpret_cast<half*>(dx.data<at::Half>()),
      affine, eps, num, chn, sp);

  return dx;
}

__global__ void leaky_relu_backward_impl_h(half *z, half *dz, float slope, int64_t count) {
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < count; i += blockDim.x * gridDim.x){
    float _z = __half2float(z[i]);
    if (_z < 0) {
      // Scale the incoming gradient by the slope and invert the in-place leaky ReLU,
      // restoring the pre-activation value of z for the batch-norm backward pass.
      dz[i] = __float2half(__half2float(dz[i]) * slope);
      z[i] = __float2half(_z / slope);
    }
  }
}

void leaky_relu_backward_cuda_h(at::Tensor z, at::Tensor dz, float slope) {
  CHECK_CUDA_INPUT(z);
  CHECK_CUDA_INPUT(dz);

  int64_t count = z.numel();
  dim3 threads(getNumThreads(count));
  dim3 blocks = (count + threads.x - 1) / threads.x;
  auto stream = at::cuda::getCurrentCUDAStream();
  leaky_relu_backward_impl_h<<<blocks, threads, 0, stream>>>(
      reinterpret_cast<half*>(z.data<at::Half>()),
      reinterpret_cast<half*>(dz.data<at::Half>()),
      slope, count);
}
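For orientation, the sketch below shows one way these half-precision entry points could be exposed to Python as a PyTorch C++ extension. It is a hypothetical illustration, not the binding file shipped in this commit; the module setup, header name, and docstrings are assumptions.

// Hypothetical binding sketch (not part of this commit). Assumes the declarations
// above are visible, e.g. through a shared header such as "inplace_abn.h".
#include <torch/extension.h>

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("forward_cuda_h", &forward_cuda_h, "In-place ABN forward (CUDA, fp16)");
  m.def("edz_eydz_cuda_h", &edz_eydz_cuda_h, "In-place ABN backward reductions (CUDA, fp16)");
  m.def("backward_cuda_h", &backward_cuda_h, "In-place ABN backward (CUDA, fp16)");
  m.def("leaky_relu_backward_cuda_h", &leaky_relu_backward_cuda_h, "In-place leaky ReLU backward (CUDA, fp16)");
}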
15
vton-api/preprocess/humanparsing/modules/src/utils/checks.h
Normal file
@@ -0,0 +1,15 @@
#pragma once

#include <ATen/ATen.h>

// Define AT_CHECK for older versions of ATen where the same function was called AT_ASSERT
#ifndef AT_CHECK
#define AT_CHECK AT_ASSERT
#endif

#define CHECK_CUDA(x) AT_CHECK((x).type().is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CPU(x) AT_CHECK(!(x).type().is_cuda(), #x " must be a CPU tensor")
#define CHECK_CONTIGUOUS(x) AT_CHECK((x).is_contiguous(), #x " must be contiguous")

#define CHECK_CUDA_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)
#define CHECK_CPU_INPUT(x) CHECK_CPU(x); CHECK_CONTIGUOUS(x)
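One detail worth noting: CHECK_CUDA_INPUT and CHECK_CPU_INPUT expand to two statements, so they belong where a plain statement sequence is valid (as in the *_cuda_h entry points above), not as the single body of an unbraced if. A minimal sketch of the intended usage; the function here is hypothetical, for illustration only.

// Hypothetical example, not part of this commit: validate arguments the same way
// the *_cuda_h entry points do before touching their data pointers.
#include <ATen/ATen.h>
#include "checks.h"

at::Tensor scale_inplace_cuda(at::Tensor x, at::Tensor weight) {
  CHECK_CUDA_INPUT(x);       // expands to CHECK_CUDA(x); CHECK_CONTIGUOUS(x)
  CHECK_CUDA_INPUT(weight);
  // ... launch a CUDA kernel that scales x in place ...
  return x;
}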
49
vton-api/preprocess/humanparsing/modules/src/utils/common.h
Normal file
@@ -0,0 +1,49 @@
#pragma once

#include <ATen/ATen.h>

/*
 * Functions to share code between CPU and GPU
 */

#ifdef __CUDACC__
// CUDA versions

#define HOST_DEVICE __host__ __device__
#define INLINE_HOST_DEVICE __host__ __device__ inline
#define FLOOR(x) floor(x)

#if __CUDA_ARCH__ >= 600
// Recent compute capabilities have block-level atomicAdd for all data types, so we use that
#define ACCUM(x,y) atomicAdd_block(&(x),(y))
#else
// Older architectures don't have block-level atomicAdd, nor atomicAdd for doubles, so we defer to atomicAdd for float
// and use the known atomicCAS-based implementation for double
template<typename data_t>
__device__ inline data_t atomic_add(data_t *address, data_t val) {
  return atomicAdd(address, val);
}

template<>
__device__ inline double atomic_add(double *address, double val) {
  unsigned long long int* address_as_ull = (unsigned long long int*)address;
  unsigned long long int old = *address_as_ull, assumed;
  do {
    assumed = old;
    old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed)));
  } while (assumed != old);
  return __longlong_as_double(old);
}

#define ACCUM(x,y) atomic_add(&(x),(y))
#endif // #if __CUDA_ARCH__ >= 600

#else
// CPU versions

#define HOST_DEVICE
#define INLINE_HOST_DEVICE inline
#define FLOOR(x) std::floor(x)
#define ACCUM(x,y) (x) += (y)

#endif // #ifdef __CUDACC__
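The ACCUM macro is the piece shared code is expected to use: under nvcc it maps to atomicAdd_block (or the atomic_add fallback above), and in the CPU translation unit it degrades to a plain +=. A minimal sketch of a kernel written against it; the kernel is hypothetical and only illustrates the pattern.

// Hypothetical example, not part of this commit: per-channel accumulation written
// against ACCUM so the same expression also compiles in the CPU code path.
#include "common.h"

#ifdef __CUDACC__
__global__ void channel_sum_kernel(const float *x, float *sums, int chn, int sp) {
  int plane = blockIdx.x;                        // one block per channel
  for (int n = threadIdx.x; n < sp; n += blockDim.x) {
    ACCUM(sums[plane], x[plane * sp + n]);       // block-level atomic add on the GPU
  }
}
#endif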
71
vton-api/preprocess/humanparsing/modules/src/utils/cuda.cuh
Normal file
@@ -0,0 +1,71 @@
#pragma once

/*
 * General settings and functions
 */
const int WARP_SIZE = 32;
const int MAX_BLOCK_SIZE = 1024;

static int getNumThreads(int nElem) {
  int threadSizes[6] = {32, 64, 128, 256, 512, MAX_BLOCK_SIZE};
  for (int i = 0; i < 6; ++i) {
    if (nElem <= threadSizes[i]) {
      return threadSizes[i];
    }
  }
  return MAX_BLOCK_SIZE;
}

/*
 * Reduction utilities
 */
template <typename T>
__device__ __forceinline__ T WARP_SHFL_XOR(T value, int laneMask, int width = warpSize,
                                           unsigned int mask = 0xffffffff) {
#if CUDART_VERSION >= 9000
  return __shfl_xor_sync(mask, value, laneMask, width);
#else
  return __shfl_xor(value, laneMask, width);
#endif
}

__device__ __forceinline__ int getMSB(int val) { return 31 - __clz(val); }

template<typename T>
struct Pair {
  T v1, v2;
  __device__ Pair() {}
  __device__ Pair(T _v1, T _v2) : v1(_v1), v2(_v2) {}
  __device__ Pair(T v) : v1(v), v2(v) {}
  __device__ Pair(int v) : v1(v), v2(v) {}
  __device__ Pair &operator+=(const Pair<T> &a) {
    v1 += a.v1;
    v2 += a.v2;
    return *this;
  }
};

template<typename T>
static __device__ __forceinline__ T warpSum(T val) {
#if __CUDA_ARCH__ >= 300
  for (int i = 0; i < getMSB(WARP_SIZE); ++i) {
    val += WARP_SHFL_XOR(val, 1 << i, WARP_SIZE);
  }
#else
  __shared__ T values[MAX_BLOCK_SIZE];
  values[threadIdx.x] = val;
  __threadfence_block();
  const int base = (threadIdx.x / WARP_SIZE) * WARP_SIZE;
  for (int i = 1; i < WARP_SIZE; i++) {
    val += values[base + ((i + threadIdx.x) % WARP_SIZE)];
  }
#endif
  return val;
}

template<typename T>
static __device__ __forceinline__ Pair<T> warpSum(Pair<T> value) {
  value.v1 = warpSum(value.v1);
  value.v2 = warpSum(value.v2);
  return value;
}
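warpSum only reduces within a single warp; the reduce<> helper used by the kernels in this commit combines it with shared memory to produce per-block results. As a hypothetical, self-contained illustration of that pattern (assuming blockDim.x is a multiple of WARP_SIZE, which getNumThreads guarantees):

// Hypothetical example, not part of this commit: sum n values per block using
// warpSum for the intra-warp step and shared memory for the inter-warp step.
__global__ void block_sum_kernel(const float *x, float *out, int n) {
  float val = 0.f;
  for (int i = threadIdx.x; i < n; i += blockDim.x) {
    val += x[blockIdx.x * n + i];
  }
  val = warpSum(val);                            // reduce within each warp

  __shared__ float warp_sums[WARP_SIZE];         // at most 1024 / 32 = 32 warps
  int lane = threadIdx.x % WARP_SIZE;
  int warp = threadIdx.x / WARP_SIZE;
  if (lane == 0) warp_sums[warp] = val;
  __syncthreads();

  if (warp == 0) {                               // first warp reduces the per-warp partials
    int num_warps = blockDim.x / WARP_SIZE;
    val = (lane < num_warps) ? warp_sums[lane] : 0.f;
    val = warpSum(val);
    if (lane == 0) out[blockIdx.x] = val;
  }
}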