Add at new repo again

commit 6e660ddb3c, 2025-01-28 21:48:35 +00:00
564 changed files with 75575 additions and 0 deletions


@@ -0,0 +1,5 @@
from .bn import ABN, InPlaceABN, InPlaceABNSync
from .functions import ACT_RELU, ACT_LEAKY_RELU, ACT_ELU, ACT_NONE
from .misc import GlobalAvgPool2d, SingleGPU
from .residual import IdentityResidualBlock
from .dense import DenseModule


@@ -0,0 +1,132 @@
import torch
import torch.nn as nn
import torch.nn.functional as functional
try:
from queue import Queue
except ImportError:
from Queue import Queue
from .functions import *
class ABN(nn.Module):
"""Activated Batch Normalization
This combines a `BatchNorm2d` and an activation function into a single module.
"""
def __init__(self, num_features, eps=1e-5, momentum=0.1, affine=True, activation="leaky_relu", slope=0.01):
"""Creates an Activated Batch Normalization module
Parameters
----------
num_features : int
Number of feature channels in the input and output.
eps : float
Small constant to prevent numerical issues.
momentum : float
Momentum factor used to compute the running statistics.
affine : bool
If `True`, apply a learned scale and shift transformation after normalization.
activation : str
Name of the activation function, one of: `leaky_relu`, `elu` or `none`.
slope : float
Negative slope for the `leaky_relu` activation.
"""
super(ABN, self).__init__()
self.num_features = num_features
self.affine = affine
self.eps = eps
self.momentum = momentum
self.activation = activation
self.slope = slope
if self.affine:
self.weight = nn.Parameter(torch.ones(num_features))
self.bias = nn.Parameter(torch.zeros(num_features))
else:
self.register_parameter('weight', None)
self.register_parameter('bias', None)
self.register_buffer('running_mean', torch.zeros(num_features))
self.register_buffer('running_var', torch.ones(num_features))
self.reset_parameters()
def reset_parameters(self):
nn.init.constant_(self.running_mean, 0)
nn.init.constant_(self.running_var, 1)
if self.affine:
nn.init.constant_(self.weight, 1)
nn.init.constant_(self.bias, 0)
def forward(self, x):
x = functional.batch_norm(x, self.running_mean, self.running_var, self.weight, self.bias,
self.training, self.momentum, self.eps)
if self.activation == ACT_RELU:
return functional.relu(x, inplace=True)
elif self.activation == ACT_LEAKY_RELU:
return functional.leaky_relu(x, negative_slope=self.slope, inplace=True)
elif self.activation == ACT_ELU:
return functional.elu(x, inplace=True)
else:
return x
def __repr__(self):
rep = '{name}({num_features}, eps={eps}, momentum={momentum},' \
' affine={affine}, activation={activation}'
if self.activation == "leaky_relu":
rep += ', slope={slope})'
else:
rep += ')'
return rep.format(name=self.__class__.__name__, **self.__dict__)
class InPlaceABN(ABN):
"""InPlace Activated Batch Normalization"""
def __init__(self, num_features, eps=1e-5, momentum=0.1, affine=True, activation="leaky_relu", slope=0.01):
"""Creates an InPlace Activated Batch Normalization module
Parameters
----------
num_features : int
Number of feature channels in the input and output.
eps : float
Small constant to prevent numerical issues.
momentum : float
Momentum factor used to compute the running statistics.
affine : bool
If `True`, apply a learned scale and shift transformation after normalization.
activation : str
Name of the activation function, one of: `leaky_relu`, `elu` or `none`.
slope : float
Negative slope for the `leaky_relu` activation.
"""
super(InPlaceABN, self).__init__(num_features, eps, momentum, affine, activation, slope)
def forward(self, x):
x, _, _ = inplace_abn(x, self.weight, self.bias, self.running_mean, self.running_var,
self.training, self.momentum, self.eps, self.activation, self.slope)
return x
class InPlaceABNSync(ABN):
"""InPlace Activated Batch Normalization with cross-GPU synchronization
This assumes that it will be replicated across GPUs using the same mechanism as in `nn.DistributedDataParallel`.
"""
def forward(self, x):
x, _, _ = inplace_abn_sync(x, self.weight, self.bias, self.running_mean, self.running_var,
self.training, self.momentum, self.eps, self.activation, self.slope)
return x
def __repr__(self):
rep = '{name}({num_features}, eps={eps}, momentum={momentum},' \
' affine={affine}, activation={activation}'
if self.activation == "leaky_relu":
rep += ', slope={slope})'
else:
rep += ')'
return rep.format(name=self.__class__.__name__, **self.__dict__)
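
These three modules are drop-in replacements for a `BatchNorm2d` followed by an activation. A minimal usage sketch; the channel counts, shapes and slope are illustrative, the top-level package name `modules` is hypothetical (the diff does not show it), and only the plain `ABN` variant avoids the compiled extension loaded in `functions.py`:

import torch
import torch.nn as nn
from modules.bn import ABN  # package name "modules" is an assumption; inside the package this is `.bn`

block = nn.Sequential(
    nn.Conv2d(64, 128, 3, padding=1, bias=False),
    ABN(128, activation="leaky_relu", slope=0.01),  # batch norm + leaky ReLU in one module
)
y = block(torch.randn(2, 64, 32, 32))  # -> shape (2, 128, 32, 32)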


@@ -0,0 +1,84 @@
import torch
import torch.nn as nn
import torch.nn.functional as functional
from models._util import try_index
from .bn import ABN
class DeeplabV3(nn.Module):
def __init__(self,
in_channels,
out_channels,
hidden_channels=256,
dilations=(12, 24, 36),
norm_act=ABN,
pooling_size=None):
super(DeeplabV3, self).__init__()
self.pooling_size = pooling_size
self.map_convs = nn.ModuleList([
nn.Conv2d(in_channels, hidden_channels, 1, bias=False),
nn.Conv2d(in_channels, hidden_channels, 3, bias=False, dilation=dilations[0], padding=dilations[0]),
nn.Conv2d(in_channels, hidden_channels, 3, bias=False, dilation=dilations[1], padding=dilations[1]),
nn.Conv2d(in_channels, hidden_channels, 3, bias=False, dilation=dilations[2], padding=dilations[2])
])
self.map_bn = norm_act(hidden_channels * 4)
self.global_pooling_conv = nn.Conv2d(in_channels, hidden_channels, 1, bias=False)
self.global_pooling_bn = norm_act(hidden_channels)
self.red_conv = nn.Conv2d(hidden_channels * 4, out_channels, 1, bias=False)
self.pool_red_conv = nn.Conv2d(hidden_channels, out_channels, 1, bias=False)
self.red_bn = norm_act(out_channels)
self.reset_parameters(self.map_bn.activation, self.map_bn.slope)
def reset_parameters(self, activation, slope):
gain = nn.init.calculate_gain(activation, slope)
for m in self.modules():
if isinstance(m, nn.Conv2d):
nn.init.xavier_normal_(m.weight.data, gain)
if hasattr(m, "bias") and m.bias is not None:
nn.init.constant_(m.bias, 0)
elif isinstance(m, ABN):
if hasattr(m, "weight") and m.weight is not None:
nn.init.constant_(m.weight, 1)
if hasattr(m, "bias") and m.bias is not None:
nn.init.constant_(m.bias, 0)
def forward(self, x):
# Map convolutions
out = torch.cat([m(x) for m in self.map_convs], dim=1)
out = self.map_bn(out)
out = self.red_conv(out)
# Global pooling
pool = self._global_pooling(x)
pool = self.global_pooling_conv(pool)
pool = self.global_pooling_bn(pool)
pool = self.pool_red_conv(pool)
if self.training or self.pooling_size is None:
pool = pool.repeat(1, 1, x.size(2), x.size(3))
out += pool
out = self.red_bn(out)
return out
def _global_pooling(self, x):
if self.training or self.pooling_size is None:
pool = x.view(x.size(0), x.size(1), -1).mean(dim=-1)
pool = pool.view(x.size(0), x.size(1), 1, 1)
else:
pooling_size = (min(try_index(self.pooling_size, 0), x.shape[2]),
min(try_index(self.pooling_size, 1), x.shape[3]))
padding = (
(pooling_size[1] - 1) // 2,
(pooling_size[1] - 1) // 2 if pooling_size[1] % 2 == 1 else (pooling_size[1] - 1) // 2 + 1,
(pooling_size[0] - 1) // 2,
(pooling_size[0] - 1) // 2 if pooling_size[0] % 2 == 1 else (pooling_size[0] - 1) // 2 + 1
)
pool = functional.avg_pool2d(x, pooling_size, stride=1)
pool = functional.pad(pool, pad=padding, mode="replicate")
return pool
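
A sketch of instantiating this ASPP-style head on a backbone feature map; channel counts and spatial size are illustrative, the `modules` package name is an assumption, and the default `norm_act=ABN` keeps it runnable without the compiled in-place kernels:

import torch
from modules.deeplab import DeeplabV3  # "modules" is an assumed package name

head = DeeplabV3(in_channels=2048, out_channels=256, hidden_channels=256, dilations=(12, 24, 36))
feats = torch.randn(2, 2048, 33, 33)  # e.g. a stride-16 feature map
out = head(feats)                     # -> shape (2, 256, 33, 33)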


@@ -0,0 +1,42 @@
from collections import OrderedDict
import torch
import torch.nn as nn
from .bn import ABN
class DenseModule(nn.Module):
def __init__(self, in_channels, growth, layers, bottleneck_factor=4, norm_act=ABN, dilation=1):
super(DenseModule, self).__init__()
self.in_channels = in_channels
self.growth = growth
self.layers = layers
self.convs1 = nn.ModuleList()
self.convs3 = nn.ModuleList()
for i in range(self.layers):
self.convs1.append(nn.Sequential(OrderedDict([
("bn", norm_act(in_channels)),
("conv", nn.Conv2d(in_channels, self.growth * bottleneck_factor, 1, bias=False))
])))
self.convs3.append(nn.Sequential(OrderedDict([
("bn", norm_act(self.growth * bottleneck_factor)),
("conv", nn.Conv2d(self.growth * bottleneck_factor, self.growth, 3, padding=dilation, bias=False,
dilation=dilation))
])))
in_channels += self.growth
@property
def out_channels(self):
return self.in_channels + self.growth * self.layers
def forward(self, x):
inputs = [x]
for i in range(self.layers):
x = torch.cat(inputs, dim=1)
x = self.convs1[i](x)
x = self.convs3[i](x)
inputs += [x]
return torch.cat(inputs, dim=1)
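
Each layer concatenates `growth` new channels onto its input, so the output carries `in_channels + growth * layers` channels, as the `out_channels` property states. A small sketch with illustrative sizes (`modules` is again an assumed package name):

import torch
from modules.dense import DenseModule

dense = DenseModule(in_channels=64, growth=16, layers=4)
y = dense(torch.randn(2, 64, 28, 28))
assert y.shape[1] == dense.out_channels == 64 + 16 * 4  # 128 channels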


@@ -0,0 +1,245 @@
from os import path
import torch
import torch.distributed as dist
import torch.autograd as autograd
import torch.cuda.comm as comm
from torch.autograd.function import once_differentiable
from torch.utils.cpp_extension import load
_src_path = path.join(path.dirname(path.abspath(__file__)), "src")
_backend = load(name="inplace_abn",
extra_cflags=["-O3"],
sources=[path.join(_src_path, f) for f in [
"inplace_abn.cpp",
"inplace_abn_cpu.cpp",
"inplace_abn_cuda.cu",
"inplace_abn_cuda_half.cu"
]],
extra_cuda_cflags=["--expt-extended-lambda"])
# Activation names
ACT_RELU = "relu"
ACT_LEAKY_RELU = "leaky_relu"
ACT_ELU = "elu"
ACT_NONE = "none"
def _check(fn, *args, **kwargs):
success = fn(*args, **kwargs)
if not success:
raise RuntimeError("CUDA Error encountered in {}".format(fn))
def _broadcast_shape(x):
out_size = []
for i, s in enumerate(x.size()):
if i != 1:
out_size.append(1)
else:
out_size.append(s)
return out_size
def _reduce(x):
if len(x.size()) == 2:
return x.sum(dim=0)
else:
n, c = x.size()[0:2]
return x.contiguous().view((n, c, -1)).sum(2).sum(0)
def _count_samples(x):
count = 1
for i, s in enumerate(x.size()):
if i != 1:
count *= s
return count
def _act_forward(ctx, x):
if ctx.activation == ACT_LEAKY_RELU:
_backend.leaky_relu_forward(x, ctx.slope)
elif ctx.activation == ACT_ELU:
_backend.elu_forward(x)
elif ctx.activation == ACT_NONE:
pass
def _act_backward(ctx, x, dx):
if ctx.activation == ACT_LEAKY_RELU:
_backend.leaky_relu_backward(x, dx, ctx.slope)
elif ctx.activation == ACT_ELU:
_backend.elu_backward(x, dx)
elif ctx.activation == ACT_NONE:
pass
class InPlaceABN(autograd.Function):
@staticmethod
def forward(ctx, x, weight, bias, running_mean, running_var,
training=True, momentum=0.1, eps=1e-05, activation=ACT_LEAKY_RELU, slope=0.01):
# Save context
ctx.training = training
ctx.momentum = momentum
ctx.eps = eps
ctx.activation = activation
ctx.slope = slope
ctx.affine = weight is not None and bias is not None
# Prepare inputs
count = _count_samples(x)
x = x.contiguous()
weight = weight.contiguous() if ctx.affine else x.new_empty(0)
bias = bias.contiguous() if ctx.affine else x.new_empty(0)
if ctx.training:
mean, var = _backend.mean_var(x)
# Update running stats
running_mean.mul_((1 - ctx.momentum)).add_(ctx.momentum * mean)
running_var.mul_((1 - ctx.momentum)).add_(ctx.momentum * var * count / (count - 1))
# Mark in-place modified tensors
ctx.mark_dirty(x, running_mean, running_var)
else:
mean, var = running_mean.contiguous(), running_var.contiguous()
ctx.mark_dirty(x)
# BN forward + activation
_backend.forward(x, mean, var, weight, bias, ctx.affine, ctx.eps)
_act_forward(ctx, x)
# Output
ctx.var = var
ctx.save_for_backward(x, var, weight, bias)
ctx.mark_non_differentiable(running_mean, running_var)
return x, running_mean, running_var
@staticmethod
@once_differentiable
def backward(ctx, dz, _drunning_mean, _drunning_var):
z, var, weight, bias = ctx.saved_tensors
dz = dz.contiguous()
# Undo activation
_act_backward(ctx, z, dz)
if ctx.training:
edz, eydz = _backend.edz_eydz(z, dz, weight, bias, ctx.affine, ctx.eps)
else:
# TODO: implement simplified CUDA backward for inference mode
edz = dz.new_zeros(dz.size(1))
eydz = dz.new_zeros(dz.size(1))
dx = _backend.backward(z, dz, var, weight, bias, edz, eydz, ctx.affine, ctx.eps)
# dweight = eydz * weight.sign() if ctx.affine else None
dweight = eydz if ctx.affine else None
if dweight is not None:
dweight[weight < 0] *= -1
dbias = edz if ctx.affine else None
return dx, dweight, dbias, None, None, None, None, None, None, None
class InPlaceABNSync(autograd.Function):
@classmethod
def forward(cls, ctx, x, weight, bias, running_mean, running_var,
training=True, momentum=0.1, eps=1e-05, activation=ACT_LEAKY_RELU, slope=0.01, equal_batches=True):
# Save context
ctx.training = training
ctx.momentum = momentum
ctx.eps = eps
ctx.activation = activation
ctx.slope = slope
ctx.affine = weight is not None and bias is not None
# Prepare inputs
ctx.world_size = dist.get_world_size() if dist.is_initialized() else 1
# count = _count_samples(x)
batch_size = x.new_tensor([x.shape[0]], dtype=torch.long)
x = x.contiguous()
weight = weight.contiguous() if ctx.affine else x.new_empty(0)
bias = bias.contiguous() if ctx.affine else x.new_empty(0)
if ctx.training:
mean, var = _backend.mean_var(x)
if ctx.world_size > 1:
# get global batch size
if equal_batches:
batch_size *= ctx.world_size
else:
dist.all_reduce(batch_size, dist.ReduceOp.SUM)
ctx.factor = x.shape[0] / float(batch_size.item())
mean_all = mean.clone() * ctx.factor
dist.all_reduce(mean_all, dist.ReduceOp.SUM)
var_all = (var + (mean - mean_all) ** 2) * ctx.factor
dist.all_reduce(var_all, dist.ReduceOp.SUM)
mean = mean_all
var = var_all
# Update running stats
running_mean.mul_((1 - ctx.momentum)).add_(ctx.momentum * mean)
count = batch_size.item() * x.view(x.shape[0], x.shape[1], -1).shape[-1]
running_var.mul_((1 - ctx.momentum)).add_(ctx.momentum * var * (float(count) / (count - 1)))
# Mark in-place modified tensors
ctx.mark_dirty(x, running_mean, running_var)
else:
mean, var = running_mean.contiguous(), running_var.contiguous()
ctx.mark_dirty(x)
# BN forward + activation
_backend.forward(x, mean, var, weight, bias, ctx.affine, ctx.eps)
_act_forward(ctx, x)
# Output
ctx.var = var
ctx.save_for_backward(x, var, weight, bias)
ctx.mark_non_differentiable(running_mean, running_var)
return x, running_mean, running_var
@staticmethod
@once_differentiable
def backward(ctx, dz, _drunning_mean, _drunning_var):
z, var, weight, bias = ctx.saved_tensors
dz = dz.contiguous()
# Undo activation
_act_backward(ctx, z, dz)
if ctx.training:
edz, eydz = _backend.edz_eydz(z, dz, weight, bias, ctx.affine, ctx.eps)
edz_local = edz.clone()
eydz_local = eydz.clone()
if ctx.world_size > 1:
edz *= ctx.factor
dist.all_reduce(edz, dist.ReduceOp.SUM)
eydz *= ctx.factor
dist.all_reduce(eydz, dist.ReduceOp.SUM)
else:
edz_local = edz = dz.new_zeros(dz.size(1))
eydz_local = eydz = dz.new_zeros(dz.size(1))
dx = _backend.backward(z, dz, var, weight, bias, edz, eydz, ctx.affine, ctx.eps)
# dweight = eydz_local * weight.sign() if ctx.affine else None
dweight = eydz_local if ctx.affine else None
if dweight is not None:
dweight[weight < 0] *= -1
dbias = edz_local if ctx.affine else None
return dx, dweight, dbias, None, None, None, None, None, None, None
inplace_abn = InPlaceABN.apply
inplace_abn_sync = InPlaceABNSync.apply
__all__ = ["inplace_abn", "inplace_abn_sync", "ACT_RELU", "ACT_LEAKY_RELU", "ACT_ELU", "ACT_NONE"]
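
In the synchronized variant, per-process moments are combined with the standard parallel-statistics identity. With per-process weights $f_k = N_k / N$ (stored as `ctx.factor`) and per-process moments $\mu_k$, $\sigma_k^2$, the two all-reduces above compute

\mu = \sum_k f_k \, \mu_k, \qquad \sigma^2 = \sum_k f_k \left( \sigma_k^2 + (\mu_k - \mu)^2 \right),

i.e. the global (biased) mean and variance; the running variance is then rescaled by `count / (count - 1)` to make the stored estimate unbiased.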


@@ -0,0 +1,21 @@
import torch.nn as nn
import torch
import torch.distributed as dist
class GlobalAvgPool2d(nn.Module):
def __init__(self):
"""Global average pooling over the input's spatial dimensions"""
super(GlobalAvgPool2d, self).__init__()
def forward(self, inputs):
in_size = inputs.size()
return inputs.view((in_size[0], in_size[1], -1)).mean(dim=2)
class SingleGPU(nn.Module):
def __init__(self, module):
super(SingleGPU, self).__init__()
self.module=module
def forward(self, input):
return self.module(input.cuda(non_blocking=True))
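
`GlobalAvgPool2d` simply averages over the two spatial dimensions. A quick sanity sketch (sizes illustrative, `modules` package name assumed):

import torch
from modules.misc import GlobalAvgPool2d

x = torch.randn(2, 512, 7, 7)
assert torch.allclose(GlobalAvgPool2d()(x), x.mean(dim=(2, 3)), atol=1e-6)  # output shape (2, 512)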


@@ -0,0 +1,182 @@
from collections import OrderedDict
import torch.nn as nn
from .bn import ABN, ACT_LEAKY_RELU, ACT_ELU, ACT_NONE
import torch.nn.functional as functional
class ResidualBlock(nn.Module):
"""Configurable residual block
Parameters
----------
in_channels : int
Number of input channels.
channels : list of int
Number of channels in the internal feature maps. Can have either two or three elements: with two, the block
uses two `3 x 3` convolutions; with three, it is a bottleneck block with `1 x 1`, then `3 x 3`, then `1 x 1`
convolutions.
stride : int
Stride of the first `3 x 3` convolution
dilation : int
Dilation to apply to the `3 x 3` convolutions.
groups : int
Number of convolution groups. This is used to create ResNeXt-style blocks and is only compatible with
bottleneck blocks.
norm_act : callable
Function to create normalization / activation Module.
dropout: callable
Function to create Dropout Module.
"""
def __init__(self,
in_channels,
channels,
stride=1,
dilation=1,
groups=1,
norm_act=ABN,
dropout=None):
super(ResidualBlock, self).__init__()
# Check parameters for inconsistencies
if len(channels) != 2 and len(channels) != 3:
raise ValueError("channels must contain either two or three values")
if len(channels) == 2 and groups != 1:
raise ValueError("groups > 1 are only valid if len(channels) == 3")
is_bottleneck = len(channels) == 3
need_proj_conv = stride != 1 or in_channels != channels[-1]
if not is_bottleneck:
bn2 = norm_act(channels[1])
bn2.activation = ACT_NONE
layers = [
("conv1", nn.Conv2d(in_channels, channels[0], 3, stride=stride, padding=dilation, bias=False,
dilation=dilation)),
("bn1", norm_act(channels[0])),
("conv2", nn.Conv2d(channels[0], channels[1], 3, stride=1, padding=dilation, bias=False,
dilation=dilation)),
("bn2", bn2)
]
if dropout is not None:
layers = layers[0:2] + [("dropout", dropout())] + layers[2:]
else:
bn3 = norm_act(channels[2])
bn3.activation = ACT_NONE
layers = [
("conv1", nn.Conv2d(in_channels, channels[0], 1, stride=1, padding=0, bias=False)),
("bn1", norm_act(channels[0])),
("conv2", nn.Conv2d(channels[0], channels[1], 3, stride=stride, padding=dilation, bias=False,
groups=groups, dilation=dilation)),
("bn2", norm_act(channels[1])),
("conv3", nn.Conv2d(channels[1], channels[2], 1, stride=1, padding=0, bias=False)),
("bn3", bn3)
]
if dropout is not None:
layers = layers[0:4] + [("dropout", dropout())] + layers[4:]
self.convs = nn.Sequential(OrderedDict(layers))
if need_proj_conv:
self.proj_conv = nn.Conv2d(in_channels, channels[-1], 1, stride=stride, padding=0, bias=False)
self.proj_bn = norm_act(channels[-1])
self.proj_bn.activation = ACT_NONE
def forward(self, x):
if hasattr(self, "proj_conv"):
residual = self.proj_conv(x)
residual = self.proj_bn(residual)
else:
residual = x
x = self.convs(x) + residual
if self.convs.bn1.activation == ACT_LEAKY_RELU:
return functional.leaky_relu(x, negative_slope=self.convs.bn1.slope, inplace=True)
elif self.convs.bn1.activation == ACT_ELU:
return functional.elu(x, inplace=True)
else:
return x
class IdentityResidualBlock(nn.Module):
def __init__(self,
in_channels,
channels,
stride=1,
dilation=1,
groups=1,
norm_act=ABN,
dropout=None):
"""Configurable identity-mapping residual block
Parameters
----------
in_channels : int
Number of input channels.
channels : list of int
Number of channels in the internal feature maps. Can have either two or three elements: with two, the block
uses two `3 x 3` convolutions; with three, it is a bottleneck block with `1 x 1`, then `3 x 3`, then `1 x 1`
convolutions.
stride : int
Stride applied in the block's first convolution.
dilation : int
Dilation to apply to the `3 x 3` convolutions.
groups : int
Number of convolution groups. This is used to create ResNeXt-style blocks and is only compatible with
bottleneck blocks.
norm_act : callable
Function to create normalization / activation Module.
dropout: callable
Function to create Dropout Module.
"""
super(IdentityResidualBlock, self).__init__()
# Check parameters for inconsistencies
if len(channels) != 2 and len(channels) != 3:
raise ValueError("channels must contain either two or three values")
if len(channels) == 2 and groups != 1:
raise ValueError("groups > 1 are only valid if len(channels) == 3")
is_bottleneck = len(channels) == 3
need_proj_conv = stride != 1 or in_channels != channels[-1]
self.bn1 = norm_act(in_channels)
if not is_bottleneck:
layers = [
("conv1", nn.Conv2d(in_channels, channels[0], 3, stride=stride, padding=dilation, bias=False,
dilation=dilation)),
("bn2", norm_act(channels[0])),
("conv2", nn.Conv2d(channels[0], channels[1], 3, stride=1, padding=dilation, bias=False,
dilation=dilation))
]
if dropout is not None:
layers = layers[0:2] + [("dropout", dropout())] + layers[2:]
else:
layers = [
("conv1", nn.Conv2d(in_channels, channels[0], 1, stride=stride, padding=0, bias=False)),
("bn2", norm_act(channels[0])),
("conv2", nn.Conv2d(channels[0], channels[1], 3, stride=1, padding=dilation, bias=False,
groups=groups, dilation=dilation)),
("bn3", norm_act(channels[1])),
("conv3", nn.Conv2d(channels[1], channels[2], 1, stride=1, padding=0, bias=False))
]
if dropout is not None:
layers = layers[0:4] + [("dropout", dropout())] + layers[4:]
self.convs = nn.Sequential(OrderedDict(layers))
if need_proj_conv:
self.proj_conv = nn.Conv2d(in_channels, channels[-1], 1, stride=stride, padding=0, bias=False)
def forward(self, x):
if hasattr(self, "proj_conv"):
bn1 = self.bn1(x)
shortcut = self.proj_conv(bn1)
else:
shortcut = x.clone()
bn1 = self.bn1(x)
out = self.convs(bn1)
out.add_(shortcut)
return out
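
A sketch of the two block variants, following the `channels` convention described in the docstrings (two entries: two `3 x 3` convolutions; three entries: `1 x 1` / `3 x 3` / `1 x 1` bottleneck). Sizes are illustrative and `modules` is an assumed package name:

import torch
from modules.residual import ResidualBlock, IdentityResidualBlock

basic = ResidualBlock(64, [64, 64])                               # two 3x3 convolutions, no projection needed
bottleneck = IdentityResidualBlock(256, [64, 64, 256], stride=2)  # 1x1 / 3x3 / 1x1, with a strided 1x1 projection
y = basic(torch.randn(2, 64, 56, 56))        # -> (2, 64, 56, 56)
z = bottleneck(torch.randn(2, 256, 56, 56))  # -> (2, 256, 28, 28)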


@@ -0,0 +1,15 @@
#pragma once
#include <ATen/ATen.h>
// Define AT_CHECK for older versions of ATen, where the same check was called AT_ASSERT
#ifndef AT_CHECK
#define AT_CHECK AT_ASSERT
#endif
#define CHECK_CUDA(x) AT_CHECK((x).type().is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CPU(x) AT_CHECK(!(x).type().is_cuda(), #x " must be a CPU tensor")
#define CHECK_CONTIGUOUS(x) AT_CHECK((x).is_contiguous(), #x " must be contiguous")
#define CHECK_CUDA_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)
#define CHECK_CPU_INPUT(x) CHECK_CPU(x); CHECK_CONTIGUOUS(x)


@@ -0,0 +1,95 @@
#include <torch/extension.h>
#include <vector>
#include "inplace_abn.h"
std::vector<at::Tensor> mean_var(at::Tensor x) {
if (x.is_cuda()) {
if (x.type().scalarType() == at::ScalarType::Half) {
return mean_var_cuda_h(x);
} else {
return mean_var_cuda(x);
}
} else {
return mean_var_cpu(x);
}
}
at::Tensor forward(at::Tensor x, at::Tensor mean, at::Tensor var, at::Tensor weight, at::Tensor bias,
bool affine, float eps) {
if (x.is_cuda()) {
if (x.type().scalarType() == at::ScalarType::Half) {
return forward_cuda_h(x, mean, var, weight, bias, affine, eps);
} else {
return forward_cuda(x, mean, var, weight, bias, affine, eps);
}
} else {
return forward_cpu(x, mean, var, weight, bias, affine, eps);
}
}
std::vector<at::Tensor> edz_eydz(at::Tensor z, at::Tensor dz, at::Tensor weight, at::Tensor bias,
bool affine, float eps) {
if (z.is_cuda()) {
if (z.type().scalarType() == at::ScalarType::Half) {
return edz_eydz_cuda_h(z, dz, weight, bias, affine, eps);
} else {
return edz_eydz_cuda(z, dz, weight, bias, affine, eps);
}
} else {
return edz_eydz_cpu(z, dz, weight, bias, affine, eps);
}
}
at::Tensor backward(at::Tensor z, at::Tensor dz, at::Tensor var, at::Tensor weight, at::Tensor bias,
at::Tensor edz, at::Tensor eydz, bool affine, float eps) {
if (z.is_cuda()) {
if (z.type().scalarType() == at::ScalarType::Half) {
return backward_cuda_h(z, dz, var, weight, bias, edz, eydz, affine, eps);
} else {
return backward_cuda(z, dz, var, weight, bias, edz, eydz, affine, eps);
}
} else {
return backward_cpu(z, dz, var, weight, bias, edz, eydz, affine, eps);
}
}
void leaky_relu_forward(at::Tensor z, float slope) {
at::leaky_relu_(z, slope);
}
void leaky_relu_backward(at::Tensor z, at::Tensor dz, float slope) {
if (z.is_cuda()) {
if (z.type().scalarType() == at::ScalarType::Half) {
return leaky_relu_backward_cuda_h(z, dz, slope);
} else {
return leaky_relu_backward_cuda(z, dz, slope);
}
} else {
return leaky_relu_backward_cpu(z, dz, slope);
}
}
void elu_forward(at::Tensor z) {
at::elu_(z);
}
void elu_backward(at::Tensor z, at::Tensor dz) {
if (z.is_cuda()) {
return elu_backward_cuda(z, dz);
} else {
return elu_backward_cpu(z, dz);
}
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("mean_var", &mean_var, "Mean and variance computation");
m.def("forward", &forward, "In-place forward computation");
m.def("edz_eydz", &edz_eydz, "First part of backward computation");
m.def("backward", &backward, "Second part of backward computation");
m.def("leaky_relu_forward", &leaky_relu_forward, "Leaky relu forward computation");
m.def("leaky_relu_backward", &leaky_relu_backward, "Leaky relu backward computation and inversion");
m.def("elu_forward", &elu_forward, "Elu forward computation");
m.def("elu_backward", &elu_backward, "Elu backward computation and inversion");
}
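
These bindings are what `functions.py` loads with `torch.utils.cpp_extension.load` and calls as `_backend.*`. A hedged sketch of exercising the raw extension directly, assuming the JIT build succeeded and reusing the function names registered above (the `modules` package name is an assumption):

import torch
from modules.functions import _backend  # the extension object built in functions.py

x = torch.randn(2, 8, 16, 16, device="cuda")
mean, var = _backend.mean_var(x)                              # per-channel statistics, shape (8,)
weight = torch.ones(8, device="cuda")
bias = torch.zeros(8, device="cuda")
y = _backend.forward(x, mean, var, weight, bias, True, 1e-5)  # normalizes x in place and returns it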


@@ -0,0 +1,88 @@
#pragma once
#include <ATen/ATen.h>
#include <vector>
std::vector<at::Tensor> mean_var_cpu(at::Tensor x);
std::vector<at::Tensor> mean_var_cuda(at::Tensor x);
std::vector<at::Tensor> mean_var_cuda_h(at::Tensor x);
at::Tensor forward_cpu(at::Tensor x, at::Tensor mean, at::Tensor var, at::Tensor weight, at::Tensor bias,
bool affine, float eps);
at::Tensor forward_cuda(at::Tensor x, at::Tensor mean, at::Tensor var, at::Tensor weight, at::Tensor bias,
bool affine, float eps);
at::Tensor forward_cuda_h(at::Tensor x, at::Tensor mean, at::Tensor var, at::Tensor weight, at::Tensor bias,
bool affine, float eps);
std::vector<at::Tensor> edz_eydz_cpu(at::Tensor z, at::Tensor dz, at::Tensor weight, at::Tensor bias,
bool affine, float eps);
std::vector<at::Tensor> edz_eydz_cuda(at::Tensor z, at::Tensor dz, at::Tensor weight, at::Tensor bias,
bool affine, float eps);
std::vector<at::Tensor> edz_eydz_cuda_h(at::Tensor z, at::Tensor dz, at::Tensor weight, at::Tensor bias,
bool affine, float eps);
at::Tensor backward_cpu(at::Tensor z, at::Tensor dz, at::Tensor var, at::Tensor weight, at::Tensor bias,
at::Tensor edz, at::Tensor eydz, bool affine, float eps);
at::Tensor backward_cuda(at::Tensor z, at::Tensor dz, at::Tensor var, at::Tensor weight, at::Tensor bias,
at::Tensor edz, at::Tensor eydz, bool affine, float eps);
at::Tensor backward_cuda_h(at::Tensor z, at::Tensor dz, at::Tensor var, at::Tensor weight, at::Tensor bias,
at::Tensor edz, at::Tensor eydz, bool affine, float eps);
void leaky_relu_backward_cpu(at::Tensor z, at::Tensor dz, float slope);
void leaky_relu_backward_cuda(at::Tensor z, at::Tensor dz, float slope);
void leaky_relu_backward_cuda_h(at::Tensor z, at::Tensor dz, float slope);
void elu_backward_cpu(at::Tensor z, at::Tensor dz);
void elu_backward_cuda(at::Tensor z, at::Tensor dz);
static void get_dims(at::Tensor x, int64_t& num, int64_t& chn, int64_t& sp) {
num = x.size(0);
chn = x.size(1);
sp = 1;
for (int64_t i = 2; i < x.ndimension(); ++i)
sp *= x.size(i);
}
/*
* Specialized CUDA reduction functions for BN
*/
#ifdef __CUDACC__
#include "utils/cuda.cuh"
template <typename T, typename Op>
__device__ T reduce(Op op, int plane, int N, int S) {
T sum = (T)0;
for (int batch = 0; batch < N; ++batch) {
for (int x = threadIdx.x; x < S; x += blockDim.x) {
sum += op(batch, plane, x);
}
}
// sum over NumThreads within a warp
sum = warpSum(sum);
// 'transpose', and reduce within warp again
__shared__ T shared[32];
__syncthreads();
if (threadIdx.x % WARP_SIZE == 0) {
shared[threadIdx.x / WARP_SIZE] = sum;
}
if (threadIdx.x >= blockDim.x / WARP_SIZE && threadIdx.x < WARP_SIZE) {
// zero out the other entries in shared
shared[threadIdx.x] = (T)0;
}
__syncthreads();
if (threadIdx.x / WARP_SIZE == 0) {
sum = warpSum(shared[threadIdx.x]);
if (threadIdx.x == 0) {
shared[0] = sum;
}
}
__syncthreads();
// Everyone picks it up, should be broadcast into the whole gradInput
return shared[0];
}
#endif
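
In Python terms, `reduce` sums `op(batch, plane, x)` over the batch and spatial positions of one channel, with one CUDA block per channel. A reference sketch of the same computation for an `(N, C, S)`-shaped tensor; the function name is hypothetical:

import torch

def reduce_reference(op, x, plane):
    # Sum op(batch, plane, s) over all batches and spatial positions of one channel,
    # mirroring the per-block reduction performed by the device function above.
    n, c, s = x.shape
    return sum(op(b, plane, i) for b in range(n) for i in range(s))

x = torch.randn(2, 4, 9)
total = reduce_reference(lambda b, p, i: x[b, p, i], x, plane=1)  # SumOp-style accumulation
assert torch.allclose(total, x[:, 1, :].sum())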


@@ -0,0 +1,119 @@
#include <ATen/ATen.h>
#include <vector>
#include "utils/checks.h"
#include "inplace_abn.h"
at::Tensor reduce_sum(at::Tensor x) {
if (x.ndimension() == 2) {
return x.sum(0);
} else {
auto x_view = x.view({x.size(0), x.size(1), -1});
return x_view.sum(-1).sum(0);
}
}
at::Tensor broadcast_to(at::Tensor v, at::Tensor x) {
if (x.ndimension() == 2) {
return v;
} else {
std::vector<int64_t> broadcast_size = {1, -1};
for (int64_t i = 2; i < x.ndimension(); ++i)
broadcast_size.push_back(1);
return v.view(broadcast_size);
}
}
int64_t count(at::Tensor x) {
int64_t count = x.size(0);
for (int64_t i = 2; i < x.ndimension(); ++i)
count *= x.size(i);
return count;
}
at::Tensor invert_affine(at::Tensor z, at::Tensor weight, at::Tensor bias, bool affine, float eps) {
if (affine) {
return (z - broadcast_to(bias, z)) / broadcast_to(at::abs(weight) + eps, z);
} else {
return z;
}
}
std::vector<at::Tensor> mean_var_cpu(at::Tensor x) {
auto num = count(x);
auto mean = reduce_sum(x) / num;
auto diff = x - broadcast_to(mean, x);
auto var = reduce_sum(diff.pow(2)) / num;
return {mean, var};
}
at::Tensor forward_cpu(at::Tensor x, at::Tensor mean, at::Tensor var, at::Tensor weight, at::Tensor bias,
bool affine, float eps) {
auto gamma = affine ? at::abs(weight) + eps : at::ones_like(var);
auto mul = at::rsqrt(var + eps) * gamma;
x.sub_(broadcast_to(mean, x));
x.mul_(broadcast_to(mul, x));
if (affine) x.add_(broadcast_to(bias, x));
return x;
}
std::vector<at::Tensor> edz_eydz_cpu(at::Tensor z, at::Tensor dz, at::Tensor weight, at::Tensor bias,
bool affine, float eps) {
auto edz = reduce_sum(dz);
auto y = invert_affine(z, weight, bias, affine, eps);
auto eydz = reduce_sum(y * dz);
return {edz, eydz};
}
at::Tensor backward_cpu(at::Tensor z, at::Tensor dz, at::Tensor var, at::Tensor weight, at::Tensor bias,
at::Tensor edz, at::Tensor eydz, bool affine, float eps) {
auto y = invert_affine(z, weight, bias, affine, eps);
auto mul = affine ? at::rsqrt(var + eps) * (at::abs(weight) + eps) : at::rsqrt(var + eps);
auto num = count(z);
auto dx = (dz - broadcast_to(edz / num, dz) - y * broadcast_to(eydz / num, dz)) * broadcast_to(mul, dz);
return dx;
}
void leaky_relu_backward_cpu(at::Tensor z, at::Tensor dz, float slope) {
CHECK_CPU_INPUT(z);
CHECK_CPU_INPUT(dz);
AT_DISPATCH_FLOATING_TYPES(z.type(), "leaky_relu_backward_cpu", ([&] {
int64_t count = z.numel();
auto *_z = z.data<scalar_t>();
auto *_dz = dz.data<scalar_t>();
for (int64_t i = 0; i < count; ++i) {
if (_z[i] < 0) {
_z[i] *= 1 / slope;
_dz[i] *= slope;
}
}
}));
}
void elu_backward_cpu(at::Tensor z, at::Tensor dz) {
CHECK_CPU_INPUT(z);
CHECK_CPU_INPUT(dz);
AT_DISPATCH_FLOATING_TYPES(z.type(), "elu_backward_cpu", ([&] {
int64_t count = z.numel();
auto *_z = z.data<scalar_t>();
auto *_dz = dz.data<scalar_t>();
for (int64_t i = 0; i < count; ++i) {
if (_z[i] < 0) {
// Scale the gradient with the stored output first, then invert ELU to recover the input
_dz[i] *= (_z[i] + 1.f);
_z[i] = log1p(_z[i]);
}
}
}));
}
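
`backward_cpu` is the standard batch-norm input gradient written in terms of the saved output: with $\hat{x} = (z - \beta) / (|\gamma| + \varepsilon)$ recovered by `invert_affine`, $N$ the per-channel element count, $e_{dz} = \sum dz$ and $e_{\hat{x} dz} = \sum \hat{x}\,dz$,

dx = \frac{|\gamma| + \varepsilon}{\sqrt{\sigma^2 + \varepsilon}} \left( dz - \frac{e_{dz}}{N} - \hat{x}\,\frac{e_{\hat{x} dz}}{N} \right),

which is exactly the expression assembled from `edz`, `eydz` and `mul` above; the CUDA kernels later in this commit compute the same formula per channel.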


@@ -0,0 +1,333 @@
#include <ATen/ATen.h>
#include <thrust/device_ptr.h>
#include <thrust/transform.h>
#include <vector>
#include "utils/checks.h"
#include "utils/cuda.cuh"
#include "inplace_abn.h"
#include <ATen/cuda/CUDAContext.h>
// Operations for reduce
template<typename T>
struct SumOp {
__device__ SumOp(const T *t, int c, int s)
: tensor(t), chn(c), sp(s) {}
__device__ __forceinline__ T operator()(int batch, int plane, int n) {
return tensor[(batch * chn + plane) * sp + n];
}
const T *tensor;
const int chn;
const int sp;
};
template<typename T>
struct VarOp {
__device__ VarOp(T m, const T *t, int c, int s)
: mean(m), tensor(t), chn(c), sp(s) {}
__device__ __forceinline__ T operator()(int batch, int plane, int n) {
T val = tensor[(batch * chn + plane) * sp + n];
return (val - mean) * (val - mean);
}
const T mean;
const T *tensor;
const int chn;
const int sp;
};
template<typename T>
struct GradOp {
__device__ GradOp(T _weight, T _bias, const T *_z, const T *_dz, int c, int s)
: weight(_weight), bias(_bias), z(_z), dz(_dz), chn(c), sp(s) {}
__device__ __forceinline__ Pair<T> operator()(int batch, int plane, int n) {
T _y = (z[(batch * chn + plane) * sp + n] - bias) / weight;
T _dz = dz[(batch * chn + plane) * sp + n];
return Pair<T>(_dz, _y * _dz);
}
const T weight;
const T bias;
const T *z;
const T *dz;
const int chn;
const int sp;
};
/***********
* mean_var
***********/
template<typename T>
__global__ void mean_var_kernel(const T *x, T *mean, T *var, int num, int chn, int sp) {
int plane = blockIdx.x;
T norm = T(1) / T(num * sp);
T _mean = reduce<T, SumOp<T>>(SumOp<T>(x, chn, sp), plane, num, sp) * norm;
__syncthreads();
T _var = reduce<T, VarOp<T>>(VarOp<T>(_mean, x, chn, sp), plane, num, sp) * norm;
if (threadIdx.x == 0) {
mean[plane] = _mean;
var[plane] = _var;
}
}
std::vector<at::Tensor> mean_var_cuda(at::Tensor x) {
CHECK_CUDA_INPUT(x);
// Extract dimensions
int64_t num, chn, sp;
get_dims(x, num, chn, sp);
// Prepare output tensors
auto mean = at::empty({chn}, x.options());
auto var = at::empty({chn}, x.options());
// Run kernel
dim3 blocks(chn);
dim3 threads(getNumThreads(sp));
auto stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES(x.type(), "mean_var_cuda", ([&] {
mean_var_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
x.data<scalar_t>(),
mean.data<scalar_t>(),
var.data<scalar_t>(),
num, chn, sp);
}));
return {mean, var};
}
/**********
* forward
**********/
template<typename T>
__global__ void forward_kernel(T *x, const T *mean, const T *var, const T *weight, const T *bias,
bool affine, float eps, int num, int chn, int sp) {
int plane = blockIdx.x;
T _mean = mean[plane];
T _var = var[plane];
T _weight = affine ? abs(weight[plane]) + eps : T(1);
T _bias = affine ? bias[plane] : T(0);
T mul = rsqrt(_var + eps) * _weight;
for (int batch = 0; batch < num; ++batch) {
for (int n = threadIdx.x; n < sp; n += blockDim.x) {
T _x = x[(batch * chn + plane) * sp + n];
T _y = (_x - _mean) * mul + _bias;
x[(batch * chn + plane) * sp + n] = _y;
}
}
}
at::Tensor forward_cuda(at::Tensor x, at::Tensor mean, at::Tensor var, at::Tensor weight, at::Tensor bias,
bool affine, float eps) {
CHECK_CUDA_INPUT(x);
CHECK_CUDA_INPUT(mean);
CHECK_CUDA_INPUT(var);
CHECK_CUDA_INPUT(weight);
CHECK_CUDA_INPUT(bias);
// Extract dimensions
int64_t num, chn, sp;
get_dims(x, num, chn, sp);
// Run kernel
dim3 blocks(chn);
dim3 threads(getNumThreads(sp));
auto stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES(x.type(), "forward_cuda", ([&] {
forward_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
x.data<scalar_t>(),
mean.data<scalar_t>(),
var.data<scalar_t>(),
weight.data<scalar_t>(),
bias.data<scalar_t>(),
affine, eps, num, chn, sp);
}));
return x;
}
/***********
* edz_eydz
***********/
template<typename T>
__global__ void edz_eydz_kernel(const T *z, const T *dz, const T *weight, const T *bias,
T *edz, T *eydz, bool affine, float eps, int num, int chn, int sp) {
int plane = blockIdx.x;
T _weight = affine ? abs(weight[plane]) + eps : 1.f;
T _bias = affine ? bias[plane] : 0.f;
Pair<T> res = reduce<Pair<T>, GradOp<T>>(GradOp<T>(_weight, _bias, z, dz, chn, sp), plane, num, sp);
__syncthreads();
if (threadIdx.x == 0) {
edz[plane] = res.v1;
eydz[plane] = res.v2;
}
}
std::vector<at::Tensor> edz_eydz_cuda(at::Tensor z, at::Tensor dz, at::Tensor weight, at::Tensor bias,
bool affine, float eps) {
CHECK_CUDA_INPUT(z);
CHECK_CUDA_INPUT(dz);
CHECK_CUDA_INPUT(weight);
CHECK_CUDA_INPUT(bias);
// Extract dimensions
int64_t num, chn, sp;
get_dims(z, num, chn, sp);
auto edz = at::empty({chn}, z.options());
auto eydz = at::empty({chn}, z.options());
// Run kernel
dim3 blocks(chn);
dim3 threads(getNumThreads(sp));
auto stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES(z.type(), "edz_eydz_cuda", ([&] {
edz_eydz_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
z.data<scalar_t>(),
dz.data<scalar_t>(),
weight.data<scalar_t>(),
bias.data<scalar_t>(),
edz.data<scalar_t>(),
eydz.data<scalar_t>(),
affine, eps, num, chn, sp);
}));
return {edz, eydz};
}
/***********
* backward
***********/
template<typename T>
__global__ void backward_kernel(const T *z, const T *dz, const T *var, const T *weight, const T *bias, const T *edz,
const T *eydz, T *dx, bool affine, float eps, int num, int chn, int sp) {
int plane = blockIdx.x;
T _weight = affine ? abs(weight[plane]) + eps : 1.f;
T _bias = affine ? bias[plane] : 0.f;
T _var = var[plane];
T _edz = edz[plane];
T _eydz = eydz[plane];
T _mul = _weight * rsqrt(_var + eps);
T count = T(num * sp);
for (int batch = 0; batch < num; ++batch) {
for (int n = threadIdx.x; n < sp; n += blockDim.x) {
T _dz = dz[(batch * chn + plane) * sp + n];
T _y = (z[(batch * chn + plane) * sp + n] - _bias) / _weight;
dx[(batch * chn + plane) * sp + n] = (_dz - _edz / count - _y * _eydz / count) * _mul;
}
}
}
at::Tensor backward_cuda(at::Tensor z, at::Tensor dz, at::Tensor var, at::Tensor weight, at::Tensor bias,
at::Tensor edz, at::Tensor eydz, bool affine, float eps) {
CHECK_CUDA_INPUT(z);
CHECK_CUDA_INPUT(dz);
CHECK_CUDA_INPUT(var);
CHECK_CUDA_INPUT(weight);
CHECK_CUDA_INPUT(bias);
CHECK_CUDA_INPUT(edz);
CHECK_CUDA_INPUT(eydz);
// Extract dimensions
int64_t num, chn, sp;
get_dims(z, num, chn, sp);
auto dx = at::zeros_like(z);
// Run kernel
dim3 blocks(chn);
dim3 threads(getNumThreads(sp));
auto stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES(z.type(), "backward_cuda", ([&] {
backward_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
z.data<scalar_t>(),
dz.data<scalar_t>(),
var.data<scalar_t>(),
weight.data<scalar_t>(),
bias.data<scalar_t>(),
edz.data<scalar_t>(),
eydz.data<scalar_t>(),
dx.data<scalar_t>(),
affine, eps, num, chn, sp);
}));
return dx;
}
/**************
* activations
**************/
template<typename T>
inline void leaky_relu_backward_impl(T *z, T *dz, float slope, int64_t count) {
// Create thrust pointers
thrust::device_ptr<T> th_z = thrust::device_pointer_cast(z);
thrust::device_ptr<T> th_dz = thrust::device_pointer_cast(dz);
auto stream = at::cuda::getCurrentCUDAStream();
thrust::transform_if(thrust::cuda::par.on(stream),
th_dz, th_dz + count, th_z, th_dz,
[slope] __device__ (const T& dz) { return dz * slope; },
[] __device__ (const T& z) { return z < 0; });
thrust::transform_if(thrust::cuda::par.on(stream),
th_z, th_z + count, th_z,
[slope] __device__ (const T& z) { return z / slope; },
[] __device__ (const T& z) { return z < 0; });
}
void leaky_relu_backward_cuda(at::Tensor z, at::Tensor dz, float slope) {
CHECK_CUDA_INPUT(z);
CHECK_CUDA_INPUT(dz);
int64_t count = z.numel();
AT_DISPATCH_FLOATING_TYPES(z.type(), "leaky_relu_backward_cuda", ([&] {
leaky_relu_backward_impl<scalar_t>(z.data<scalar_t>(), dz.data<scalar_t>(), slope, count);
}));
}
template<typename T>
inline void elu_backward_impl(T *z, T *dz, int64_t count) {
// Create thrust pointers
thrust::device_ptr<T> th_z = thrust::device_pointer_cast(z);
thrust::device_ptr<T> th_dz = thrust::device_pointer_cast(dz);
auto stream = at::cuda::getCurrentCUDAStream();
thrust::transform_if(thrust::cuda::par.on(stream),
th_dz, th_dz + count, th_z, th_z, th_dz,
[] __device__ (const T& dz, const T& z) { return dz * (z + 1.); },
[] __device__ (const T& z) { return z < 0; });
thrust::transform_if(thrust::cuda::par.on(stream),
th_z, th_z + count, th_z,
[] __device__ (const T& z) { return log1p(z); },
[] __device__ (const T& z) { return z < 0; });
}
void elu_backward_cuda(at::Tensor z, at::Tensor dz) {
CHECK_CUDA_INPUT(z);
CHECK_CUDA_INPUT(dz);
int64_t count = z.numel();
AT_DISPATCH_FLOATING_TYPES(z.type(), "elu_backward_cuda", ([&] {
elu_backward_impl<scalar_t>(z.data<scalar_t>(), dz.data<scalar_t>(), count);
}));
}
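
The activation backward kernels also invert the activation in place, which is the core in-place trick: only the activated output is stored, and the pre-activation value is reconstructed during backward. A small sketch of the leaky-ReLU case handled by the `thrust::transform_if` calls above (slope value illustrative):

import torch

slope = 0.01
x = torch.randn(1000)                         # pre-activation values
z = torch.nn.functional.leaky_relu(x, slope)  # what the in-place forward keeps instead of x

dz = torch.ones_like(z)  # incoming gradient w.r.t. z
neg = z < 0
dz[neg] *= slope         # d(leaky_relu)/dx = slope where x < 0
z[neg] /= slope          # invert the activation to recover x
assert torch.allclose(z, x, atol=1e-6)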


@@ -0,0 +1,275 @@
#include <ATen/ATen.h>
#include <cuda_fp16.h>
#include <vector>
#include "utils/checks.h"
#include "utils/cuda.cuh"
#include "inplace_abn.h"
#include <ATen/cuda/CUDAContext.h>
// Operations for reduce
struct SumOpH {
__device__ SumOpH(const half *t, int c, int s)
: tensor(t), chn(c), sp(s) {}
__device__ __forceinline__ float operator()(int batch, int plane, int n) {
return __half2float(tensor[(batch * chn + plane) * sp + n]);
}
const half *tensor;
const int chn;
const int sp;
};
struct VarOpH {
__device__ VarOpH(float m, const half *t, int c, int s)
: mean(m), tensor(t), chn(c), sp(s) {}
__device__ __forceinline__ float operator()(int batch, int plane, int n) {
const auto t = __half2float(tensor[(batch * chn + plane) * sp + n]);
return (t - mean) * (t - mean);
}
const float mean;
const half *tensor;
const int chn;
const int sp;
};
struct GradOpH {
__device__ GradOpH(float _weight, float _bias, const half *_z, const half *_dz, int c, int s)
: weight(_weight), bias(_bias), z(_z), dz(_dz), chn(c), sp(s) {}
__device__ __forceinline__ Pair<float> operator()(int batch, int plane, int n) {
float _y = (__half2float(z[(batch * chn + plane) * sp + n]) - bias) / weight;
float _dz = __half2float(dz[(batch * chn + plane) * sp + n]);
return Pair<float>(_dz, _y * _dz);
}
const float weight;
const float bias;
const half *z;
const half *dz;
const int chn;
const int sp;
};
/***********
* mean_var
***********/
__global__ void mean_var_kernel_h(const half *x, float *mean, float *var, int num, int chn, int sp) {
int plane = blockIdx.x;
float norm = 1.f / static_cast<float>(num * sp);
float _mean = reduce<float, SumOpH>(SumOpH(x, chn, sp), plane, num, sp) * norm;
__syncthreads();
float _var = reduce<float, VarOpH>(VarOpH(_mean, x, chn, sp), plane, num, sp) * norm;
if (threadIdx.x == 0) {
mean[plane] = _mean;
var[plane] = _var;
}
}
std::vector<at::Tensor> mean_var_cuda_h(at::Tensor x) {
CHECK_CUDA_INPUT(x);
// Extract dimensions
int64_t num, chn, sp;
get_dims(x, num, chn, sp);
// Prepare output tensors
auto mean = at::empty({chn},x.options().dtype(at::kFloat));
auto var = at::empty({chn},x.options().dtype(at::kFloat));
// Run kernel
dim3 blocks(chn);
dim3 threads(getNumThreads(sp));
auto stream = at::cuda::getCurrentCUDAStream();
mean_var_kernel_h<<<blocks, threads, 0, stream>>>(
reinterpret_cast<half*>(x.data<at::Half>()),
mean.data<float>(),
var.data<float>(),
num, chn, sp);
return {mean, var};
}
/**********
* forward
**********/
__global__ void forward_kernel_h(half *x, const float *mean, const float *var, const float *weight, const float *bias,
bool affine, float eps, int num, int chn, int sp) {
int plane = blockIdx.x;
const float _mean = mean[plane];
const float _var = var[plane];
const float _weight = affine ? abs(weight[plane]) + eps : 1.f;
const float _bias = affine ? bias[plane] : 0.f;
const float mul = rsqrt(_var + eps) * _weight;
for (int batch = 0; batch < num; ++batch) {
for (int n = threadIdx.x; n < sp; n += blockDim.x) {
half *x_ptr = x + (batch * chn + plane) * sp + n;
float _x = __half2float(*x_ptr);
float _y = (_x - _mean) * mul + _bias;
*x_ptr = __float2half(_y);
}
}
}
at::Tensor forward_cuda_h(at::Tensor x, at::Tensor mean, at::Tensor var, at::Tensor weight, at::Tensor bias,
bool affine, float eps) {
CHECK_CUDA_INPUT(x);
CHECK_CUDA_INPUT(mean);
CHECK_CUDA_INPUT(var);
CHECK_CUDA_INPUT(weight);
CHECK_CUDA_INPUT(bias);
// Extract dimensions
int64_t num, chn, sp;
get_dims(x, num, chn, sp);
// Run kernel
dim3 blocks(chn);
dim3 threads(getNumThreads(sp));
auto stream = at::cuda::getCurrentCUDAStream();
forward_kernel_h<<<blocks, threads, 0, stream>>>(
reinterpret_cast<half*>(x.data<at::Half>()),
mean.data<float>(),
var.data<float>(),
weight.data<float>(),
bias.data<float>(),
affine, eps, num, chn, sp);
return x;
}
__global__ void edz_eydz_kernel_h(const half *z, const half *dz, const float *weight, const float *bias,
float *edz, float *eydz, bool affine, float eps, int num, int chn, int sp) {
int plane = blockIdx.x;
float _weight = affine ? abs(weight[plane]) + eps : 1.f;
float _bias = affine ? bias[plane] : 0.f;
Pair<float> res = reduce<Pair<float>, GradOpH>(GradOpH(_weight, _bias, z, dz, chn, sp), plane, num, sp);
__syncthreads();
if (threadIdx.x == 0) {
edz[plane] = res.v1;
eydz[plane] = res.v2;
}
}
std::vector<at::Tensor> edz_eydz_cuda_h(at::Tensor z, at::Tensor dz, at::Tensor weight, at::Tensor bias,
bool affine, float eps) {
CHECK_CUDA_INPUT(z);
CHECK_CUDA_INPUT(dz);
CHECK_CUDA_INPUT(weight);
CHECK_CUDA_INPUT(bias);
// Extract dimensions
int64_t num, chn, sp;
get_dims(z, num, chn, sp);
auto edz = at::empty({chn},z.options().dtype(at::kFloat));
auto eydz = at::empty({chn},z.options().dtype(at::kFloat));
// Run kernel
dim3 blocks(chn);
dim3 threads(getNumThreads(sp));
auto stream = at::cuda::getCurrentCUDAStream();
edz_eydz_kernel_h<<<blocks, threads, 0, stream>>>(
reinterpret_cast<half*>(z.data<at::Half>()),
reinterpret_cast<half*>(dz.data<at::Half>()),
weight.data<float>(),
bias.data<float>(),
edz.data<float>(),
eydz.data<float>(),
affine, eps, num, chn, sp);
return {edz, eydz};
}
__global__ void backward_kernel_h(const half *z, const half *dz, const float *var, const float *weight, const float *bias, const float *edz,
const float *eydz, half *dx, bool affine, float eps, int num, int chn, int sp) {
int plane = blockIdx.x;
float _weight = affine ? abs(weight[plane]) + eps : 1.f;
float _bias = affine ? bias[plane] : 0.f;
float _var = var[plane];
float _edz = edz[plane];
float _eydz = eydz[plane];
float _mul = _weight * rsqrt(_var + eps);
float count = float(num * sp);
for (int batch = 0; batch < num; ++batch) {
for (int n = threadIdx.x; n < sp; n += blockDim.x) {
float _dz = __half2float(dz[(batch * chn + plane) * sp + n]);
float _y = (__half2float(z[(batch * chn + plane) * sp + n]) - _bias) / _weight;
dx[(batch * chn + plane) * sp + n] = __float2half((_dz - _edz / count - _y * _eydz / count) * _mul);
}
}
}
at::Tensor backward_cuda_h(at::Tensor z, at::Tensor dz, at::Tensor var, at::Tensor weight, at::Tensor bias,
at::Tensor edz, at::Tensor eydz, bool affine, float eps) {
CHECK_CUDA_INPUT(z);
CHECK_CUDA_INPUT(dz);
CHECK_CUDA_INPUT(var);
CHECK_CUDA_INPUT(weight);
CHECK_CUDA_INPUT(bias);
CHECK_CUDA_INPUT(edz);
CHECK_CUDA_INPUT(eydz);
// Extract dimensions
int64_t num, chn, sp;
get_dims(z, num, chn, sp);
auto dx = at::zeros_like(z);
// Run kernel
dim3 blocks(chn);
dim3 threads(getNumThreads(sp));
auto stream = at::cuda::getCurrentCUDAStream();
backward_kernel_h<<<blocks, threads, 0, stream>>>(
reinterpret_cast<half*>(z.data<at::Half>()),
reinterpret_cast<half*>(dz.data<at::Half>()),
var.data<float>(),
weight.data<float>(),
bias.data<float>(),
edz.data<float>(),
eydz.data<float>(),
reinterpret_cast<half*>(dx.data<at::Half>()),
affine, eps, num, chn, sp);
return dx;
}
__global__ void leaky_relu_backward_impl_h(half *z, half *dz, float slope, int64_t count) {
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < count; i += blockDim.x * gridDim.x){
float _z = __half2float(z[i]);
if (_z < 0) {
dz[i] = __float2half(__half2float(dz[i]) * slope);
z[i] = __float2half(_z / slope);
}
}
}
void leaky_relu_backward_cuda_h(at::Tensor z, at::Tensor dz, float slope) {
CHECK_CUDA_INPUT(z);
CHECK_CUDA_INPUT(dz);
int64_t count = z.numel();
dim3 threads(getNumThreads(count));
dim3 blocks = (count + threads.x - 1) / threads.x;
auto stream = at::cuda::getCurrentCUDAStream();
leaky_relu_backward_impl_h<<<blocks, threads, 0, stream>>>(
reinterpret_cast<half*>(z.data<at::Half>()),
reinterpret_cast<half*>(dz.data<at::Half>()),
slope, count);
}


@@ -0,0 +1,15 @@
#pragma once
#include <ATen/ATen.h>
// Define AT_CHECK for older versions of ATen, where the same check was called AT_ASSERT
#ifndef AT_CHECK
#define AT_CHECK AT_ASSERT
#endif
#define CHECK_CUDA(x) AT_CHECK((x).type().is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CPU(x) AT_CHECK(!(x).type().is_cuda(), #x " must be a CPU tensor")
#define CHECK_CONTIGUOUS(x) AT_CHECK((x).is_contiguous(), #x " must be contiguous")
#define CHECK_CUDA_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)
#define CHECK_CPU_INPUT(x) CHECK_CPU(x); CHECK_CONTIGUOUS(x)


@@ -0,0 +1,49 @@
#pragma once
#include <ATen/ATen.h>
/*
* Functions to share code between CPU and GPU
*/
#ifdef __CUDACC__
// CUDA versions
#define HOST_DEVICE __host__ __device__
#define INLINE_HOST_DEVICE __host__ __device__ inline
#define FLOOR(x) floor(x)
#if __CUDA_ARCH__ >= 600
// Recent compute capabilities have block-level atomicAdd for all data types, so we use that
#define ACCUM(x,y) atomicAdd_block(&(x),(y))
#else
// Older architectures don't have block-level atomicAdd, nor atomicAdd for doubles, so we defer to atomicAdd for float
// and use the known atomicCAS-based implementation for double
template<typename data_t>
__device__ inline data_t atomic_add(data_t *address, data_t val) {
return atomicAdd(address, val);
}
template<>
__device__ inline double atomic_add(double *address, double val) {
unsigned long long int* address_as_ull = (unsigned long long int*)address;
unsigned long long int old = *address_as_ull, assumed;
do {
assumed = old;
old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed)));
} while (assumed != old);
return __longlong_as_double(old);
}
#define ACCUM(x,y) atomic_add(&(x),(y))
#endif // #if __CUDA_ARCH__ >= 600
#else
// CPU versions
#define HOST_DEVICE
#define INLINE_HOST_DEVICE inline
#define FLOOR(x) std::floor(x)
#define ACCUM(x,y) (x) += (y)
#endif // #ifdef __CUDACC__


@@ -0,0 +1,71 @@
#pragma once
/*
* General settings and functions
*/
const int WARP_SIZE = 32;
const int MAX_BLOCK_SIZE = 1024;
static int getNumThreads(int nElem) {
int threadSizes[6] = {32, 64, 128, 256, 512, MAX_BLOCK_SIZE};
for (int i = 0; i < 6; ++i) {
if (nElem <= threadSizes[i]) {
return threadSizes[i];
}
}
return MAX_BLOCK_SIZE;
}
/*
* Reduction utilities
*/
template <typename T>
__device__ __forceinline__ T WARP_SHFL_XOR(T value, int laneMask, int width = warpSize,
unsigned int mask = 0xffffffff) {
#if CUDART_VERSION >= 9000
return __shfl_xor_sync(mask, value, laneMask, width);
#else
return __shfl_xor(value, laneMask, width);
#endif
}
__device__ __forceinline__ int getMSB(int val) { return 31 - __clz(val); }
template<typename T>
struct Pair {
T v1, v2;
__device__ Pair() {}
__device__ Pair(T _v1, T _v2) : v1(_v1), v2(_v2) {}
__device__ Pair(T v) : v1(v), v2(v) {}
__device__ Pair(int v) : v1(v), v2(v) {}
__device__ Pair &operator+=(const Pair<T> &a) {
v1 += a.v1;
v2 += a.v2;
return *this;
}
};
template<typename T>
static __device__ __forceinline__ T warpSum(T val) {
#if __CUDA_ARCH__ >= 300
for (int i = 0; i < getMSB(WARP_SIZE); ++i) {
val += WARP_SHFL_XOR(val, 1 << i, WARP_SIZE);
}
#else
__shared__ T values[MAX_BLOCK_SIZE];
values[threadIdx.x] = val;
__threadfence_block();
const int base = (threadIdx.x / WARP_SIZE) * WARP_SIZE;
for (int i = 1; i < WARP_SIZE; i++) {
val += values[base + ((i + threadIdx.x) % WARP_SIZE)];
}
#endif
return val;
}
template<typename T>
static __device__ __forceinline__ Pair<T> warpSum(Pair<T> value) {
value.v1 = warpSum(value.v1);
value.v2 = warpSum(value.v2);
return value;
}
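
`warpSum` is a butterfly reduction: after log2(32) = 5 XOR-shuffle steps every lane holds the sum of all 32 lanes. A host-side sketch of the same exchange pattern (pure Python, names hypothetical, for illustration only):

def warp_sum_reference(vals):
    # Butterfly sum over a 32-lane "warp": each step exchanges values between lanes
    # whose indices differ in one bit, mirroring WARP_SHFL_XOR in warpSum above.
    lanes = list(vals)
    assert len(lanes) == 32
    step = 1
    while step < 32:  # masks 1, 2, 4, 8, 16, matching 1 << i for i in range(getMSB(32))
        lanes = [lanes[i] + lanes[i ^ step] for i in range(32)]
        step *= 2
    return lanes      # every entry now equals sum(vals)

assert warp_sum_reference(range(32))[7] == sum(range(32))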