下面介绍了根据构建的Rulebook执行具体稀疏卷积计算,继续看类SparseConvolution
,代码位于:spconv/conv.py
class SparseConvolution(SparseModule):
__constants__ = [
'stride', 'padding', 'dilation', 'groups', 'bias', 'subm', 'inverse',
'transposed', 'output_padding', 'fused_bn'
]
def __init__(self,
ndim, # 数据特征维度,SubMConv3d和SparseConv3d的ndim为3
in_channels, # 输入通道
out_channels, # 输出通道
kernel_size=3, # 卷积核尺寸
stride=1, # 步长
padding=0, # 填充值
dilation=1, # 空洞卷积:卷积核各个元素之间的间隔,默认卷积方式dilation=1
groups=1, # 深度可分离卷积:groups参数将 in_channel和out_channel 按照次序分别分成了一 一对应的groups组,默认为1
bias=True, # 偏置
subm=False, # 区分是标准3d稀疏卷积还是3d子流行稀疏卷积
output_padding=0, # 输出填充,默认为0
transposed=False, # 转置卷积,默认为False
inverse=False, # 反卷积,默认为False
indice_key=None, # 索引key,子流线卷积不会改变输入输出位置索引以及输出特征图空间形状,可以字典存储起来直接利用
fused_bn=False, # conv和bn融合
use_hash=False, # 分区使用cpu和gpu计算卷积,默认为False,使用gpu
algo=ops.ConvAlgo.Native): # 3种内存分配方式,值为0,1,2,默认0
super(SparseConvolution, self).__init__()
assert groups == 1
if not isinstance(kernel_size, (list, tuple)):
kernel_size = [kernel_size] * ndim
if not isinstance(stride, (list, tuple)):
stride = [stride] * ndim
if not isinstance(padding, (list, tuple)):
padding = [padding] * ndim
if not isinstance(dilation, (list, tuple)):
dilation = [dilation] * ndim
if not isinstance(output_padding, (list, tuple)):
output_padding = [output_padding] * ndim
for d, s in zip(dilation, stride): # 必须有一个为1
assert any([s == 1, d == 1]), "don't support this."
self.ndim = ndim # 3d稀疏卷积ndim为3
self.in_channels = in_channels # 5
self.out_channels = out_channels # 16
self.kernel_size = kernel_size # 3
self.conv1x1 = np.prod(kernel_size) == 1 # 计算所有元素的乘积
self.stride = stride # 1
self.padding = padding # 1
self.dilation = dilation # 空洞卷积:卷积核各个元素之间的间隔,默认卷积方式dilation=1
self.transposed = transposed # False
self.inverse = inverse # False
self.output_padding = output_padding # 0
self.groups = groups # 深度可分离卷积:groups参数将 in_channel和out_channel 按照次序分别分成了一 一对应的groups组,默认为1
self.subm = subm # False 用于区分是标准3d稀疏卷积还是3d子流行稀疏卷积
self.indice_key = indice_key # 索引key,子流线卷积不会改变输入输出位置索引以及输出特征图空间形状,可以存储起来直接利用
self.fused_bn = fused_bn # conv和bn融合
self.use_hash = use_hash # cpu版利用哈希
self.algo = algo.value # 获取枚举标签的值,0,1,2分别对应3中内存分配方式
# torch.nn.Parameter()将一个不可训练的tensor转换成可以训练的类型parameter,并将这个parameter绑定到这个module里面
# 使用这个函数的目的也是想让某些变量在学习的过程中不断的修改其值以达到最优化。
# 27,16,32
self.weight = Parameter(
torch.Tensor(*kernel_size, in_channels, out_channels))
if bias:
self.bias = Parameter(torch.Tensor(out_channels))
else:
# 将bias参数添加到模块中,通过使用给定名字可以访问该参数.
self.register_parameter('bias', None)
self.reset_parameters()
def reset_parameters(self):
n = self.in_channels
init.kaiming_uniform_(self.weight, a=math.sqrt(5))
if self.bias is not None:
fan_in, _ = _calculate_fan_in_and_fan_out_hwio(self.weight)
bound = 1 / math.sqrt(fan_in)
init.uniform_(self.bias, -bound, bound)
def forward(self, input):
assert isinstance(input, spconv.SparseConvTensor)
features = input.features # (N,5)
device = features.device # cuda:0
indices = input.indices # (N, 4) 网格坐标(batch_id,z,y,x)
spatial_shape = input.spatial_shape # [41, 1440, 1440]
batch_size = input.batch_size # 4
# 1.计算输出空间形状
if not self.subm: # 普通3d稀疏卷积
if self.transposed: # False
out_spatial_shape = ops.get_deconv_output_size(
spatial_shape, self.kernel_size, self.stride, self.padding,self.dilation, self.output_padding)
else:
# 子流线稀疏卷积计算输出卷积的形状 [41, 1440, 1440]-->[21, 720, 720]
out_spatial_shape = ops.get_conv_output_size(
spatial_shape, self.kernel_size, self.stride, self.padding,self.dilation)
else: # SubMConv 卷积
# 子流线卷积输入和输出形状相同
out_spatial_shape = spatial_shape # [41, 1440, 1440]
# input.update_grid(out_spatial_shape)
# t = time.time()
if self.conv1x1: # 单独处理1x1卷积
features = torch.mm(
input.features,
self.weight.view(self.in_channels, self.out_channels))
if self.bias is not None:
features += self.bias
out_tensor = spconv.SparseConvTensor(features, input.indices,input.spatial_shape,input.batch_size)
out_tensor.indice_dict = input.indice_dict
out_tensor.grid = input.grid
return out_tensor
# input为SparseConvTensor,因为submconv3d不会改变输入输出位置索引以及输出特征图空间形状
# 如果本层的outids, indice_pairs, indice_pair_num, out_spatial_shape等与之前计算过的层相同,这里用字典存储,后面直接使用,避免重复计算
datas = input.find_indice_pair(self.indice_key)
# 2.构建Rulebook,直接使用Python调用c++接口
if self.inverse: # False 处理反卷积
assert datas is not None and self.indice_key is not None
_, outids, indice_pairs, indice_pair_num, out_spatial_shape = datas
assert indice_pair_num.shape[0] == np.prod(self.kernel_size), "inverse conv must have same kernel size as its couple conv"
else: # 非反卷积
# 如:self.indice_key = 'subm1' and datas = None
if self.indice_key is not None and datas is not None:
outids, _, indice_pairs, indice_pair_num, _ = datas
else:
# indices:[N, 4], 就是input的indices的属性,即voxel网格坐标索引
# outids: [N, 4],由于submconv性质,outids和 incides 是一样的,如果是标准的spconv,就不一样了
# indice_pairs: [2, 27, N],2是对应关系,2表示输入和输出两个方向,第0位储存输入indices的下标,第1位储存输出outids中的下标,27 为卷积核的volume 3x3x3,N 表示输入有效(active)特征的数量
# indice_pair_num: [27],用于保存卷积核每一个位置上的总的计算的次数,因为是稀疏卷积卷积核上每一个元素和有效数据的运算次数可能是不同的
# return {indices.to(device), indicePairs.to(device),indiceNum.to(device)};
outids, indice_pairs, indice_pair_num = ops.get_indice_pairs(
indices, # (N, 4)
batch_size, # 4
spatial_shape, # [41, 1440, 1440]
self.kernel_size, # 3
self.stride,
self.padding,
self.dilation,
self.output_padding,
self.subm,
self.transposed,
grid=input.grid,
use_hash=self.use_hash)
# 将索引信息写入input的索引字典中
input.indice_dict[self.indice_key] = (outids,indices, indice_pairs,indice_pair_num,spatial_shape)
# 3.根据构建的Rulebook执行具体稀疏卷积计算
if self.fused_bn: # False
assert self.bias is not None
out_features = ops.fused_indice_conv(features, self.weight,self.bias,indice_pairs.to(device),indice_pair_num,outids.shape[0], self.inverse,self.subm)
else:
if self.subm: # 子流线稀疏卷积
# Fsp.indice_subm_conv和Fsp.indice_conv经function.py中的SubMConvFunction和SparseConvFunction对象辗转还是会继续调用ops模块中的indice_conv等函数。
# 最终,他们都会以torch.ops.spconv.xx的形式调用c++扩展共享库中的api来完成任务
out_features = Fsp.indice_subm_conv(features, # 输入特征(N,5)
self.weight, # 权重(27*16*32)
indice_pairs.to(device), # [2, 27, N]
indice_pair_num, # [27],用于保存卷积核每一个位置上的总的计算的次数
outids.shape[0], # N
self.algo # 获取枚举标签的值,0,1,2分别对应3中内存分配方式
)
else:
if self.inverse:
out_features = Fsp.indice_inverse_conv(features, self.weight, indice_pairs.to(device),indice_pair_num, outids.shape[0], self.algo)
else:
out_features = Fsp.indice_conv(features, self.weight,indice_pairs.to(device),indice_pair_num,outids.shape[0], self.algo)
if self.bias is not None:
out_features += self.bias
out_tensor = spconv.SparseConvTensor(out_features, outids,out_spatial_shape, batch_size)
out_tensor.indice_dict = input.indice_dict
out_tensor.grid = input.grid
return out_tensor
Fsp.indice_subm_conv和Fsp.indice_conv经spconv/functional.py中的SubMConvFunction和SparseConvFunction对象辗转还是会继续调用spconv/ops.py模块中的indice_conv等函数。
先看子流线卷积的接口:indice_subm_conv,代码:spconv/functional.py
class SubMConvFunction(Function):
@staticmethod
def forward(ctx, features, filters, indice_pairs, indice_pair_num,num_activate_out, algo):
ctx.save_for_backward(indice_pairs, indice_pair_num, features, filters)
ctx.algo = algo
return ops.indice_conv(features, # 输入特征(N,5)
filters, # 权重(27*16*32)
indice_pairs, # [2, 27, N]
indice_pair_num, # [27],用于保存卷积核每一个位置上的总的计算的次数
num_activate_out,# N 输出有效active个数
False, # 反卷积 false
True, # 子流线卷积默认 true
algo=algo # 获取枚举标签的值,0,1,2分别对应3中内存分配方式
)
@staticmethod
def backward(ctx, grad_output):
indice_pairs, indice_pair_num, features, filters = ctx.saved_tensors
input_bp, filters_bp = ops.indice_conv_backward(features,
filters,
grad_output,
indice_pairs,
indice_pair_num,
False,
True,
algo=ctx.algo)
return input_bp, filters_bp, None, None, None, None
indice_subm_conv = SubMConvFunction.apply
直接用Python
接口调用 C++
函数不太“美观”,这里用torch.autograd.Function
来封装算子的底层调用。
Function
类本身表示 PyTorch 的一个可导函数,只要为其定义了前向推理和反向传播的实现,我们就可以把它当成一个普通 PyTorch 函数来使用,PyTorch
会自动调度该函数,合适地执行前向和反向计算。
扩充:对模型部署来说,
Function
类有一个很好的性质:如果它定义了symbolic
静态方法,该Function
在执行torch.onnx.export()
时就可以根据symbolic
中定义的规则转换成ONNX
算子
apply
是torch.autograd.Function
的一个方法,这个方法完成了 Function
在前向推理或者反向传播时的调度,使用 indice_subm_conv = SubMConvFunction.apply
把这个调用方法取了一个更简短的别名 indice_subm_conv
,以后在使用 indice_subm_conv
算子时,我们可以忽略 SubMConvFunction
的实现细节,而只通过indice_subm_conv
这个接口来访问算子。
SubMConvFunction的前向传播forward调用spconv/ops.py
的indice_conv
方法
def indice_conv(features,
filters,
indice_pairs,
indice_pair_num,
num_activate_out,
inverse=False,
subm=False,
algo=ConvAlgo.Native.value):
return torch.ops.spconv.indice_conv(features, filters, indice_pairs,indice_pair_num, num_activate_out,int(inverse), int(subm), algo)
在src/spconv/all.cc
文件中通过Pytorch
提供的OP Register
对底层c++ api
进行了注册
static auto registry =
torch::RegisterOperators()
.op("spconv::get_indice_pairs", &spconv::getIndicePairs)
.op("spconv::indice_conv", &spconv::indiceConv)
通过torch.ops.load_library
加载.so
文件, torch.ops.spconv.indice_conv
方式来调用src/spconv/spconv_ops.cc
文件中的indiceConv
函数
下面具体看src/spconv/spconv_ops.cc
的indiceConv
函数:
torch::Tensor indiceConv(torch::Tensor features, // 输入特征(N,5)
torch::Tensor filters, // 权重(27*16*32)
torch::Tensor indicePairs,// [2, 27, N]
torch::Tensor indiceNum, // [27],用于保存卷积核每一个位置上的总的计算的次数
int64_t numActOut, // N 输出有效active个数
int64_t _inverse, // 反卷积 false
int64_t _subM, // 子流线卷积默认 true
int64_t algo // 获取枚举标签的值,0,1,2分别对应3中内存分配方式
) {
auto kernelVolume = indiceNum.size(0); // N
switch (algo) { // 默认0
case kBatchGemmGather: // 2
case kBatch: { // 1
if (kernelVolume != 1) {
return indiceConvBatch(features, filters, indicePairs, indiceNum,numActOut, _inverse, _subM,algo != kBatchGemmGather);
}
else {
break;
}
}
case kNative:
break;
default:
TV_THROW_RT_ERR("unknown algo");
}
// auto timer = spconv::CudaContextTimer<>();
bool subM = _subM != 0;
bool inverse = _inverse != 0;
auto device = features.device().type();
auto ndim = filters.dim() - 2;
auto numInPlanes = features.size(1);
auto numOutPlanes = filters.size(ndim + 1);
auto indicePairNumCpu = indiceNum.to({torch::kCPU});
auto options =
torch::TensorOptions().dtype(features.dtype()).device(features.device());
torch::Tensor output = torch::zeros({numActOut, numOutPlanes}, options);
filters = filters.view({-1, numInPlanes, numOutPlanes});
// init for subM
int indicePairMaxOffset = kernelVolume / 2;
int indicePairMaxSize = numActOut;
if (subM) { // the center index of subm conv don't need gather and scatter
// add.
torch::mm_out(output, features, filters[indicePairMaxOffset]);
// get indice pair second max size based on subM symmetric property
indicePairMaxSize =
*std::max_element(indicePairNumCpu.data_ptr<int>(),
indicePairNumCpu.data_ptr<int>() + indicePairMaxOffset);
if (indicePairMaxSize == 0) {
return output;
}
} else {
indicePairMaxSize =
*std::max_element(indicePairNumCpu.data_ptr<int>(),
indicePairNumCpu.data_ptr<int>() + kernelVolume);
}
torch::Tensor inputBuffer =
torch::empty({indicePairMaxSize, numInPlanes}, options);
torch::Tensor outputBuffer =
torch::empty({indicePairMaxSize, numOutPlanes}, options);
double totalGatherTime = 0;
double totalGEMMTime = 0;
double totalSAddTime = 0;
// tv::ssprint("first subm gemm time", timer.report() / 1000.0);
for (int i = 0; i < kernelVolume; ++i) {
auto nHot = indicePairNumCpu.data_ptr<int>()[i];
if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
continue;
}
// TODO torch::from_blob is a little slow
auto outputBufferBlob = torch::from_blob(outputBuffer.data_ptr(),
{nHot, numOutPlanes}, options);
auto inputBufferBlob =
torch::from_blob(inputBuffer.data_ptr(), {nHot, numInPlanes}, options);
if (device == torch::kCPU) {
sparse_gather_cpu(inputBuffer, features, indicePairs[inverse][i], nHot);
}
#ifdef TV_CUDA
else if (device == torch::kCUDA) {
sparse_gather_cuda(inputBuffer, features, indicePairs[inverse][i], nHot);
/* slower than SparseGatherFunctor, may due to int->long conversion
auto indicePairLong = indicePairs[i][inverse].to(torch::kInt64);
auto indicePairBlob = torch::from_blob(indicePairLong.data<long>(),
{nHot}, indicePairOptions); torch::index_select_out(inputBufferBlob,
features, 0, indicePairBlob);*/
}
#endif
else {
TV_THROW_INVALID_ARG("unknown device type");
}
// totalGatherTime += timer.report() / 1000.0;
torch::mm_out(outputBufferBlob, inputBufferBlob, filters[i]);
// totalGEMMTime += timer.report() / 1000.0;
if (device == torch::kCPU) {
sparse_scatter_add_cpu(outputBuffer, output, indicePairs[!inverse][i],
nHot);
}
#ifdef TV_CUDA
else if (device == torch::kCUDA) {
sparse_scatter_add_cuda(outputBuffer, output, indicePairs[!inverse][i],
nHot);
}
#endif
else {
TV_THROW_INVALID_ARG("unknown device type");
}
// totalSAddTime += timer.report() / 1000.0;
}
// tv::ssprint(totalGatherTime, totalGEMMTime, totalSAddTime);
return output;
}
代写!!
评论(0)
您还未登录,请登录后发表或查看评论