The following walks through how the actual sparse convolution is executed using the constructed rulebook. We continue with the SparseConvolution class, located in spconv/conv.py:

class SparseConvolution(SparseModule):
    __constants__ = [
        'stride', 'padding', 'dilation', 'groups', 'bias', 'subm', 'inverse',
        'transposed', 'output_padding', 'fused_bn'
    ]

    def __init__(self,
                 ndim,              # number of spatial dimensions; 3 for both SubMConv3d and SparseConv3d
                 in_channels,       # input channels
                 out_channels,      # output channels
                 kernel_size=3,     # kernel size
                 stride=1,          # stride
                 padding=0,         # padding
                 dilation=1,        # dilated convolution: spacing between kernel elements; dilation=1 is an ordinary convolution
                 groups=1,          # grouped convolution: splits in_channels and out_channels into `groups` one-to-one groups; default 1
                 bias=True,         # bias
                 subm=False,        # distinguishes a regular sparse 3D convolution from a submanifold sparse 3D convolution
                 output_padding=0,  # output padding, default 0
                 transposed=False,  # transposed convolution, default False
                 inverse=False,     # inverse convolution, default False
                 indice_key=None,   # rulebook key: submanifold conv changes neither the input/output index pairs nor the output spatial shape, so they can be cached in a dict and reused
                 fused_bn=False,    # fuse conv and bn
                 use_hash=False,    # use a hash table when building the rulebook (CPU path); default False
                 algo=ops.ConvAlgo.Native): # one of 3 GEMM/buffer strategies (Native, Batch, BatchGemmGather), valued 0, 1, 2; default Native
        super(SparseConvolution, self).__init__()
        assert groups == 1
        if not isinstance(kernel_size, (list, tuple)):
            kernel_size = [kernel_size] * ndim
        if not isinstance(stride, (list, tuple)):
            stride = [stride] * ndim
        if not isinstance(padding, (list, tuple)):
            padding = [padding] * ndim
        if not isinstance(dilation, (list, tuple)):
            dilation = [dilation] * ndim
        if not isinstance(output_padding, (list, tuple)):
            output_padding = [output_padding] * ndim

        for d, s in zip(dilation, stride): # for each dimension, at least one of stride and dilation must be 1
            assert any([s == 1, d == 1]), "don't support this."

        self.ndim = ndim # 3 for 3D sparse convolution
        self.in_channels = in_channels # e.g. 5
        self.out_channels = out_channels # e.g. 16
        self.kernel_size = kernel_size # e.g. 3
        self.conv1x1 = np.prod(kernel_size) == 1 # product of all kernel dims; 1 means a 1x1 convolution
        self.stride = stride # e.g. 1
        self.padding = padding # e.g. 1
        self.dilation = dilation # spacing between kernel elements; dilation=1 is an ordinary convolution
        self.transposed = transposed # False
        self.inverse = inverse # False
        self.output_padding = output_padding # 0
        self.groups = groups # grouped convolution; default 1
        self.subm = subm # False; distinguishes regular sparse convolution from submanifold sparse convolution
        self.indice_key = indice_key # rulebook key; submanifold conv keeps the index pairs and output spatial shape unchanged, so they can be cached and reused
        self.fused_bn = fused_bn # fuse conv and bn
        self.use_hash = use_hash # hash-based rulebook construction (CPU path)
        self.algo = algo.value # enum value; 0, 1, 2 select one of the 3 GEMM/buffer strategies
        # torch.nn.Parameter() turns a plain tensor into a trainable Parameter and registers it on this module,
        # so its values are updated during training.
        # shape (*kernel_size, in_channels, out_channels), e.g. (3, 3, 3, 16, 32); kernel volume 27
        self.weight = Parameter(
            torch.Tensor(*kernel_size, in_channels, out_channels))
        if bias:
            self.bias = Parameter(torch.Tensor(out_channels))
        else:
            # registers 'bias' on the module under the given name so it can still be accessed (here as None).
            self.register_parameter('bias', None)
        self.reset_parameters()

    def reset_parameters(self):
        n = self.in_channels
        init.kaiming_uniform_(self.weight, a=math.sqrt(5))
        if self.bias is not None:
            fan_in, _ = _calculate_fan_in_and_fan_out_hwio(self.weight)
            bound = 1 / math.sqrt(fan_in)
            init.uniform_(self.bias, -bound, bound)

    def forward(self, input):
        assert isinstance(input, spconv.SparseConvTensor)
        features = input.features # (N,5)
        device = features.device # cuda:0
        indices = input.indices # (N, 4) grid coordinates (batch_id, z, y, x)
        spatial_shape = input.spatial_shape # [41, 1440, 1440] 
        batch_size = input.batch_size # 4

        # 1. Compute the output spatial shape
        if not self.subm: # regular sparse 3D convolution
            if self.transposed: # False
                out_spatial_shape = ops.get_deconv_output_size(
                    spatial_shape, self.kernel_size, self.stride, self.padding, self.dilation, self.output_padding)
            else:
                # regular sparse convolution: compute the output spatial shape, e.g. [41, 1440, 1440] -> [21, 720, 720]
                out_spatial_shape = ops.get_conv_output_size(
                    spatial_shape, self.kernel_size, self.stride, self.padding, self.dilation)
        else: # submanifold (SubMConv) convolution
              # input and output spatial shapes are identical
            out_spatial_shape = spatial_shape # [41, 1440, 1440]
        # input.update_grid(out_spatial_shape)
        # t = time.time()
        if self.conv1x1: # special-case 1x1 convolution: reduces to a plain matrix multiply, no rulebook needed
            features = torch.mm(
                input.features,
                self.weight.view(self.in_channels, self.out_channels))
            if self.bias is not None:
                features += self.bias
            out_tensor = spconv.SparseConvTensor(features, input.indices,input.spatial_shape,input.batch_size)
            out_tensor.indice_dict = input.indice_dict
            out_tensor.grid = input.grid
            return out_tensor
        # input is a SparseConvTensor; since submconv3d changes neither the input/output index pairs nor the output spatial shape,
        # this layer's outids, indice_pairs, indice_pair_num, out_spatial_shape etc. may match those of an earlier layer;
        # they are stored in a dict and reused directly, avoiding recomputation
        datas = input.find_indice_pair(self.indice_key)

        # 2. Build the rulebook, calling the C++ interface directly from Python
        if self.inverse: # False; handles inverse convolution
            assert datas is not None and self.indice_key is not None
            _, outids, indice_pairs, indice_pair_num, out_spatial_shape = datas
            assert indice_pair_num.shape[0] == np.prod(self.kernel_size), "inverse conv must have same kernel size as its couple conv"
        else: # not an inverse convolution
              # e.g. self.indice_key = 'subm1' and datas = None
            if self.indice_key is not None and datas is not None:
                outids, _, indice_pairs, indice_pair_num, _ = datas
            else:
                # indices: [N, 4], the input's indices attribute, i.e. the voxel grid coordinates
                # outids: [N, 4]; by the submanifold property, outids equals indices; for a regular spconv they differ
                # indice_pairs: [2, 27, N]; 2 covers the input and output directions: slot 0 stores offsets into the input
                #   indices, slot 1 stores offsets into outids; 27 is the kernel volume (3x3x3); N is the number of active input features
                # indice_pair_num: [27]; the number of computations at each kernel position, since in sparse convolution
                #   each kernel element can interact with a different number of active sites
                # return {indices.to(device), indicePairs.to(device), indiceNum.to(device)};
                outids, indice_pairs, indice_pair_num = ops.get_indice_pairs(
                    indices,            # (N, 4)
                    batch_size,         # 4
                    spatial_shape,      # [41, 1440, 1440]
                    self.kernel_size,   # 3
                    self.stride,        
                    self.padding,
                    self.dilation,
                    self.output_padding,
                    self.subm,
                    self.transposed,
                    grid=input.grid,
                    use_hash=self.use_hash)
                # cache the index information in the input's indice dict
                input.indice_dict[self.indice_key] = (outids, indices, indice_pairs, indice_pair_num, spatial_shape)
        
        # 3. Execute the actual sparse convolution using the constructed rulebook
        if self.fused_bn: # False
            assert self.bias is not None
            out_features = ops.fused_indice_conv(features, self.weight,self.bias,indice_pairs.to(device),indice_pair_num,outids.shape[0], self.inverse,self.subm)
        else:
            if self.subm: # submanifold sparse convolution
                # Fsp.indice_subm_conv and Fsp.indice_conv go through the SubMConvFunction and SparseConvFunction
                # objects in functional.py and still end up calling indice_conv etc. in the ops module.
                # Ultimately, they all call the C++ extension shared library's API as torch.ops.spconv.xx
                out_features = Fsp.indice_subm_conv(features,           # input features (N, 5)
                                                    self.weight,        # weights (27, 16, 32)
                                                    indice_pairs.to(device),  # [2, 27, N]
                                                    indice_pair_num,    # [27], number of computations per kernel position
                                                    outids.shape[0],    # N
                                                    self.algo           # enum value; 0, 1, 2 select one of the 3 GEMM/buffer strategies
                                                    )
            else:
                if self.inverse:
                    out_features = Fsp.indice_inverse_conv(features, self.weight, indice_pairs.to(device),indice_pair_num, outids.shape[0], self.algo)
                else:
                    out_features = Fsp.indice_conv(features, self.weight,indice_pairs.to(device),indice_pair_num,outids.shape[0], self.algo)

            if self.bias is not None:
                out_features += self.bias
        out_tensor = spconv.SparseConvTensor(out_features, outids,out_spatial_shape, batch_size)
        out_tensor.indice_dict = input.indice_dict
        out_tensor.grid = input.grid
        return out_tensor
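
For orientation, here is a minimal usage sketch of the class above (my own example, assuming spconv 1.x with a CUDA build; the shapes mirror the comments in forward, and in a real pipeline the coordinates would come from a voxelizer rather than random sampling):

import torch
import spconv

# 100 active voxels with 5-channel features on a [41, 1440, 1440] grid, batch_size = 4
features = torch.randn(100, 5).cuda()            # (N, C_in)
indices = torch.stack([
    torch.randint(0, 4, (100,)),                 # batch index
    torch.randint(0, 41, (100,)),                # z
    torch.randint(0, 1440, (100,)),              # y
    torch.randint(0, 1440, (100,)),              # x
], dim=1).int().cuda()                           # (N, 4), must be int32
x = spconv.SparseConvTensor(features, indices, [41, 1440, 1440], 4)

conv = spconv.SubMConv3d(5, 16, kernel_size=3, padding=1, indice_key="subm1").cuda()
out = conv(x)  # out.spatial_shape == [41, 1440, 1440]; out.indices is the same as x.indices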

Fsp.indice_subm_conv and Fsp.indice_conv go through the SubMConvFunction and SparseConvFunction objects in spconv/functional.py and eventually call indice_conv and related functions in the spconv/ops.py module.

Let's first look at the submanifold convolution interface, indice_subm_conv, in spconv/functional.py:

class SubMConvFunction(Function):
    @staticmethod
    def forward(ctx, features, filters, indice_pairs, indice_pair_num,num_activate_out, algo):
        ctx.save_for_backward(indice_pairs, indice_pair_num, features, filters)
        ctx.algo = algo
        return ops.indice_conv(features,        # input features (N, 5)
                               filters,         # weights (27, 16, 32)
                               indice_pairs,    # [2, 27, N]
                               indice_pair_num, # [27], number of computations per kernel position
                               num_activate_out,# N, number of active output sites
                               False,           # inverse: False
                               True,            # subm: True for submanifold convolution
                               algo=algo        # enum value; 0, 1, 2 select one of the 3 GEMM/buffer strategies
                               )

    @staticmethod
    def backward(ctx, grad_output):
        indice_pairs, indice_pair_num, features, filters = ctx.saved_tensors
        input_bp, filters_bp = ops.indice_conv_backward(features,
                                                        filters,
                                                        grad_output,
                                                        indice_pairs,
                                                        indice_pair_num,
                                                        False,
                                                        True,
                                                        algo=ctx.algo)

        return input_bp, filters_bp, None, None, None, None

indice_subm_conv = SubMConvFunction.apply

Calling C++ functions straight from a plain Python interface is not very "clean"; here torch.autograd.Function is used to wrap the low-level operator call.

The Function class represents a differentiable function in PyTorch. As long as we define its forward and backward implementations, we can use it like an ordinary PyTorch function: PyTorch automatically dispatches it, running the forward and backward passes as appropriate.
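
As a standalone illustration of this mechanism (a toy example of mine, not spconv code), a differentiable square function needs nothing more than the two static methods:

import torch
from torch.autograd import Function

class Square(Function):
    @staticmethod
    def forward(ctx, x):
        ctx.save_for_backward(x)    # stash inputs needed by backward
        return x * x

    @staticmethod
    def backward(ctx, grad_output):
        x, = ctx.saved_tensors
        return 2 * x * grad_output  # d(x^2)/dx = 2x

square = Square.apply               # same aliasing pattern as indice_subm_conv above
x = torch.tensor([3.0], requires_grad=True)
square(x).backward()
print(x.grad)                       # tensor([6.])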

A side note for model deployment: the Function class has a very useful property. If it defines a static symbolic method, the Function can be converted into an ONNX operator during torch.onnx.export(), following the rules defined in symbolic.
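
A sketch of that property, extending the toy Square above (mapping it to ONNX's built-in Mul operator is my own choice of target):

from torch.autograd import Function

class SquareOnnx(Function):
    @staticmethod
    def forward(ctx, x):
        return x * x

    @staticmethod
    def symbolic(g, x):
        # during torch.onnx.export, this Function is emitted as an ONNX Mul node
        return g.op("Mul", x, x)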

apply is a method of torch.autograd.Function; it performs the dispatch when the Function runs its forward or backward pass. The line indice_subm_conv = SubMConvFunction.apply simply gives this entry point a shorter alias, indice_subm_conv; from then on we can ignore the implementation details of SubMConvFunction and access the operator solely through the indice_subm_conv interface.

SubMConvFunction's forward pass calls the indice_conv method in spconv/ops.py:

def indice_conv(features,
                filters,
                indice_pairs,
                indice_pair_num,
                num_activate_out,
                inverse=False,
                subm=False,
                algo=ConvAlgo.Native.value):
    return torch.ops.spconv.indice_conv(features, filters, indice_pairs,indice_pair_num, num_activate_out,int(inverse), int(subm), algo)

In src/spconv/all.cc, the low-level C++ API is registered through the operator-registration mechanism provided by PyTorch (torch::RegisterOperators):

static auto registry =
    torch::RegisterOperators()
        .op("spconv::get_indice_pairs", &spconv::getIndicePairs)
        .op("spconv::indice_conv", &spconv::indiceConv)

The .so file is loaded via torch.ops.load_library, after which torch.ops.spconv.indice_conv invokes the indiceConv function in src/spconv/spconv_ops.cc.
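
The loading pattern looks roughly like this (a sketch: the actual .so path depends on the build, and importing the spconv package performs this load for you):

import torch

# make the C++ extension's registered ops visible under torch.ops.spconv
torch.ops.load_library("path/to/libspconv.so")  # hypothetical path

# the registered schema is then called positionally, with the bool flags passed as ints:
# out = torch.ops.spconv.indice_conv(features, filters, indice_pairs,
#                                    indice_pair_num, num_activate_out,
#                                    0, 1, 0)  # inverse=0, subm=1, algo=Native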

Now let's look at the indiceConv function in src/spconv/spconv_ops.cc in detail:

torch::Tensor indiceConv(torch::Tensor features,   // input features (N, 5)
                         torch::Tensor filters,    // weights (27, 16, 32)
                         torch::Tensor indicePairs,// [2, 27, N]
                         torch::Tensor indiceNum,  // [27], number of computations per kernel position
                         int64_t numActOut,        // N, number of active output sites
                         int64_t _inverse,         // inverse conv: false
                         int64_t _subM,            // submanifold conv: true
                         int64_t algo              // enum value; 0, 1, 2 select one of the 3 GEMM/buffer strategies
                         ) {
  auto kernelVolume = indiceNum.size(0); // kernel volume, e.g. 27 for a 3x3x3 kernel
  switch (algo) {         // default 0 (kNative)
  case kBatchGemmGather:  // 2
  case kBatch: {          // 1
    if (kernelVolume != 1) {
      return indiceConvBatch(features, filters, indicePairs, indiceNum,numActOut, _inverse, _subM,algo != kBatchGemmGather);
    } 
    else {
      break;
    }
  }
  case kNative:
    break;
  default:
    TV_THROW_RT_ERR("unknown algo");
  }
  // auto timer = spconv::CudaContextTimer<>();

  bool subM = _subM != 0;
  bool inverse = _inverse != 0;
  auto device = features.device().type();
  auto ndim = filters.dim() - 2;
  auto numInPlanes = features.size(1);
  auto numOutPlanes = filters.size(ndim + 1);
  auto indicePairNumCpu = indiceNum.to({torch::kCPU});

  auto options =
      torch::TensorOptions().dtype(features.dtype()).device(features.device());
  torch::Tensor output = torch::zeros({numActOut, numOutPlanes}, options);
  filters = filters.view({-1, numInPlanes, numOutPlanes});

  // init for subM
  int indicePairMaxOffset = kernelVolume / 2;
  int indicePairMaxSize = numActOut;
  if (subM) { // the center position of a subm conv needs no gather/scatter,
    // just a direct matmul added into the output.
    torch::mm_out(output, features, filters[indicePairMaxOffset]);

    // get indice pair second max size based on subM symmetric property
    indicePairMaxSize =
      *std::max_element(indicePairNumCpu.data_ptr<int>(),
                        indicePairNumCpu.data_ptr<int>() + indicePairMaxOffset);
    if (indicePairMaxSize == 0) {
      return output;
    }
  } else {
    indicePairMaxSize =
      *std::max_element(indicePairNumCpu.data_ptr<int>(),
                        indicePairNumCpu.data_ptr<int>() + kernelVolume);
  }

  torch::Tensor inputBuffer =
      torch::empty({indicePairMaxSize, numInPlanes}, options);
  torch::Tensor outputBuffer =
      torch::empty({indicePairMaxSize, numOutPlanes}, options);

  double totalGatherTime = 0;
  double totalGEMMTime = 0;
  double totalSAddTime = 0;
  // tv::ssprint("first subm gemm time", timer.report() / 1000.0);

  for (int i = 0; i < kernelVolume; ++i) {
    auto nHot = indicePairNumCpu.data_ptr<int>()[i];
    if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
      continue;
    }
    // TODO torch::from_blob is a little slow
    auto outputBufferBlob = torch::from_blob(outputBuffer.data_ptr(),
                                             {nHot, numOutPlanes}, options);
    auto inputBufferBlob =
        torch::from_blob(inputBuffer.data_ptr(), {nHot, numInPlanes}, options);

    if (device == torch::kCPU) {
      sparse_gather_cpu(inputBuffer, features, indicePairs[inverse][i], nHot);
    }
#ifdef TV_CUDA
    else if (device == torch::kCUDA) {
      sparse_gather_cuda(inputBuffer, features, indicePairs[inverse][i], nHot);
      /* slower than SparseGatherFunctor, may due to int->long conversion
      auto indicePairLong = indicePairs[i][inverse].to(torch::kInt64);
      auto indicePairBlob = torch::from_blob(indicePairLong.data<long>(),
      {nHot}, indicePairOptions); torch::index_select_out(inputBufferBlob,
      features, 0, indicePairBlob);*/
    }
#endif
    else {
      TV_THROW_INVALID_ARG("unknown device type");
    }
    // totalGatherTime += timer.report() / 1000.0;
    torch::mm_out(outputBufferBlob, inputBufferBlob, filters[i]);
    // totalGEMMTime += timer.report() / 1000.0;

    if (device == torch::kCPU) {
      sparse_scatter_add_cpu(outputBuffer, output, indicePairs[!inverse][i],
                             nHot);
    }
#ifdef TV_CUDA
    else if (device == torch::kCUDA) {
      sparse_scatter_add_cuda(outputBuffer, output, indicePairs[!inverse][i],
                              nHot);
    }
#endif
    else {
      TV_THROW_INVALID_ARG("unknown device type");
    }
    // totalSAddTime += timer.report() / 1000.0;
  }
  // tv::ssprint(totalGatherTime, totalGEMMTime, totalSAddTime);
  return output;
}
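
To recap the loop above: for each kernel offset i, the active input rows are gathered, multiplied with that offset's filter slice, and scatter-added into the output rows. A slow pure-PyTorch re-implementation sketch of this gather-GEMM-scatter pattern (my own code, ignoring the subM center-offset shortcut) may make the data flow clearer:

import torch

def indice_conv_reference(features, filters, indice_pairs, indice_pair_num, num_act_out):
    # features:        (N_in, C_in) active input features
    # filters:         (K, C_in, C_out), kernel flattened kernel-volume-major
    # indice_pairs:    (2, K, N) gather/scatter index pairs per kernel offset
    # indice_pair_num: (K,) number of valid pairs per kernel offset
    output = features.new_zeros(num_act_out, filters.shape[-1])
    for i in range(filters.shape[0]):
        n_hot = int(indice_pair_num[i])
        if n_hot <= 0:
            continue
        in_idx = indice_pairs[0, i, :n_hot].long()
        out_idx = indice_pairs[1, i, :n_hot].long()
        buf = features.index_select(0, in_idx)  # gather active inputs for this offset
        buf = buf @ filters[i]                  # GEMM with this offset's filter slice
        output.index_add_(0, out_idx, buf)      # scatter-add into the output rows
    return output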
