Building the RuleBook for regular 3D sparse convolution

We continue with how the RuleBook is built for regular sparse convolution. Back in src/spconv/spconv_ops.cc, look at the regular 3D sparse convolution branch of the getIndicePairs function:

    // torch.numel() counts elements: 2*27*N, so numel/2 + 1 = N*27 + 1
    auto indicePairUnique = torch::full({indicePairs.numel() / 2 + 1}, std::numeric_limits<int>::max(),torch::dtype(torch::kInt32).device(indices.device()));
    // [N*27,4]
    torch::Tensor outInds = torch::zeros({numAct * kernelVolume, coorDim + 1},torch::dtype(torch::kInt32).device(indices.device()));
    if (indices.device().type() == torch::kCPU) { // CPU
      numActOut = create_conv_indice_pair_cpu(indices, outInds, gridOut, indicePairs, indiceNum, kernelSize, stride,padding, dilation, outSpatialShape, transpose, false, useHash);
    }
#ifdef TV_CUDA
    else if (indices.device().type() == torch::kCUDA) { // GPU
      numActOut = create_conv_indice_pair_p1_cuda(indices,          // torch.Size([N, 4]) voxel spatial indices
                                                  indicePairs,      // torch.Size([2,27,N]), filled with -1, stores the rulebook
                                                  indiceNum,        // torch.Size([27]): number of (input, output) pairs at each kernel position
                                                  indicePairUnique, // N*27+1
                                                  kernelSize,       // [3,3,3]
                                                  stride,           // [2,2,2]
                                                  padding,          // [1,1,1]
                                                  dilation,         // [1,1,1]
                                                  outSpatialShape,  // [21, 720, 720]
                                                  transpose         // False
                                                  );
      if (numActOut > 0) {
        auto res = torch::_unique(indicePairUnique);
        indicePairUnique = std::get<0>(res);
        numActOut = create_conv_indice_pair_p2_cuda(indices, 
                                                    outInds, 
                                                    gridOut, 
                                                    indicePairs, 
                                                    indiceNum, 
                                                    indicePairUnique,
                                                    outSpatialShape, 
                                                    transpose, 
                                                    false, 
                                                    useHash);
        if (numActOut == -1) {
          auto device = indices.device();
          outInds = outInds.to({torch::kCPU});
          indicePairs = indicePairs.to({torch::kCPU});
          indiceNum = indiceNum.to({torch::kCPU});
          indices = indices.to({torch::kCPU});
          numActOut = create_conv_indice_pair_cpu(indices, outInds, gridOut, indicePairs, indiceNum, kernelSize,stride, padding, dilation, outSpatialShape, transpose, false,useHash);

          return {outInds.to(device).slice(0, 0, numActOut),indicePairs.to(device), indiceNum.to(device)};
        }
      }
    }
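
As a quick sanity check on the shapes annotated above, the relation between indicePairs and indicePairUnique can be reproduced in a minimal standalone sketch (not spconv code; N is an arbitrary example value):

#include <cstdint>
#include <iostream>

// Sketch only: reproduce the size arithmetic of indicePairs ([2, kernelVolume, N])
// and indicePairUnique (indicePairs.numel() / 2 + 1).
int main() {
  const int64_t N = 1000;          // number of active input voxels (example value)
  const int64_t kernelVolume = 27; // 3*3*3 kernel
  const int64_t indicePairsNumel = 2 * kernelVolume * N;        // torch.numel()
  const int64_t indicePairUniqueLen = indicePairsNumel / 2 + 1; // = N*27 + 1
  std::cout << indicePairUniqueLen << std::endl;                // 27001
  return 0;
}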

Regular 3D sparse convolution calls create_conv_indice_pair_p1_cuda and create_conv_indice_pair_p2_cuda. Let's look at create_conv_indice_pair_p1_cuda first; it is located in src/spconv/indice.cu

int create_conv_indice_pair_p1_cuda(
    torch::Tensor indicesIn,                // torch.Size([N, 4])
    torch::Tensor indicePairs,              // torch.Size([2,27,N]), filled with -1, stores the rulebook
    torch::Tensor indiceNum,                // torch.Size([27]): number of (input, output) pairs at each kernel position
    torch::Tensor indicePairUnique,         // N*27+1
    std::vector<int64_t> kernelSize,        // [3,3,3]
    std::vector<int64_t> stride,            // [2,2,2]
    std::vector<int64_t> padding,           // [1,1,1]
    std::vector<int64_t> dilation,          // [1,1,1]
    std::vector<int64_t> outSpatialShape,   // [21, 720, 720]
    bool transpose                          // False
    ) {
  auto stream = at::cuda::getCurrentCUDAStream();
  auto ndim = kernelSize.size();         // 3
  auto numActIn = indicesIn.size(0);     // N
  auto kernelVolume = indiceNum.size(0); // 27
  if (numActIn == 0)
    return 0;
  tv::dispatch_torch<int32_t>(indicesIn.scalar_type(), [&](auto IndexValue) {
    using Index = TV_DECLTYPE(IndexValue);
    using IndexGrid = int32_t;
    tv::dispatch_int<2, 3, 4>(ndim, [&](auto I) {
      constexpr int NDim = TV_DECLTYPE(I)::value;
      // copy the parameters into tv::SimpleVector variables
      tv::SimpleVector<Index, NDim> ks(kernelSize.begin(), kernelSize.end());
      tv::SimpleVector<Index, NDim> st(stride.begin(), stride.end());
      tv::SimpleVector<Index, NDim> pa(padding.begin(), padding.end());
      tv::SimpleVector<Index, NDim> di(dilation.begin(), dilation.end());
      tv::SimpleVector<Index, NDim> ou(outSpatialShape.begin(),outSpatialShape.end());
      tv::DispatchInt<max_kernel_vol_t>()(kernelVolume, std::less_equal<int>(), [&](auto I2) {
            constexpr int MaxKernelVolume = TV_DECLTYPE(I2)::value;
            if (transpose) { // False
              prepareDeConvIndicePairsKernel<Index, NDim, MaxKernelVolume>
                  <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS,
                     0, stream>>>(tv::torch2tv<Index>(indicesIn),
                                  tv::torch2tv<Index>(indicePairs),
                                  tv::torch2tv<Index>(indiceNum),
                                  tv::torch2tv<Index>(indicePairUnique), ks, st,pa, di, ou);
              TV_CHECK_CUDA_ERR_V2("prepareDeConvIndicePairsKernel failed");
            } else {
              prepareIndicePairsKernel<Index, NDim, MaxKernelVolume>
                  <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS,0, stream>>>(
                                    tv::torch2tv<Index>(indicesIn),         // torch.Size([N, 4])
                                    tv::torch2tv<Index>(indicePairs),       // torch.Size([2,27,N]), filled with -1, stores the rulebook
                                    tv::torch2tv<Index>(indiceNum),         // torch.Size([27]): number of (input, output) pairs at each kernel position
                                    tv::torch2tv<Index>(indicePairUnique), 
                                    ks,                                     // kernel size
                                    st,                                     // stride
                                    pa,                                     // padding
                                    di,                                     // dilation
                                    ou                                      // output spatial shape
                                    );
              TV_CHECK_CUDA_ERR_V2("prepareIndicePairsKernel failed");
            }
#ifdef TV_LOG_KERNEL_INFO
            cudaFuncAttributes attr;
            checkCudaErrors(cudaFuncGetAttributes(
                &attr,
                prepareDeConvIndicePairsKernel<Index, NDim, MaxKernelVolume>));
            tv::ssprint("prepareIndicePairsKernel<", tv::type_s<Index>, NDim,
                        MaxKernelVolume, ">", attr.numRegs);
#endif
          });
    });
  });
  return 1;
}

Now focus on the prepareIndicePairsKernel kernel:

template <typename Index, unsigned NDim, int KernelMaxVolume = 256,typename Index1D = int>
__global__ void prepareIndicePairsKernel(
    tv::TensorView<const Index> indicesIn,  // torch.Size([N, 4])
    tv::TensorView<Index> indicePairs,      // torch.Size([2,27,N]), filled with -1, stores the rulebook
    tv::TensorView<Index> indiceNum,        // torch.Size([27]): number of (input, output) pairs at each kernel position
    tv::TensorView<Index1D> indicePairUnique,
    const tv::SimpleVector<Index, NDim> kernelSize,     // kernel size
    const tv::SimpleVector<Index, NDim> stride,         // stride
    const tv::SimpleVector<Index, NDim> padding,        // padding
    const tv::SimpleVector<Index, NDim> dilation,       // dilation
    const tv::SimpleVector<Index, NDim> outSpatialShape // output spatial shape [21, 720, 720]
    ) {
  auto numActIn = indicesIn.dim(0); // N
  Index spatialVolume = 1; // spatial volume of the output feature map (used below for the batch offset)
#pragma unroll
  for (int i = 0; i < NDim; ++i) {
    spatialVolume *= outSpatialShape[i]; // 21*720*720
  }
  Index kernelVolume = 1;
#pragma unroll
  for (int i = 0; i < NDim; ++i) {
    kernelVolume *= kernelSize[i];      // 3*3*3
  }
  Index numValidPoints = 0;
  Index validPoints[KernelMaxVolume * (NDim + 1)]; // 27*4
  Index *pointPtr = nullptr;
  auto indicePairsDim2 = indicePairs.dim(2); // N
  Index index;
  for (int ix : tv::KernelLoopX<int>(numActIn)) {
    numValidPoints = getValidOutPos<Index, NDim>(
        indicesIn.data() + ix * (NDim + 1) + 1, 
        kernelSize.data(),      // kernel size 3,3,3
        stride.data(),          // stride      2,2,2
        padding.data(),         // padding     1,1,1
        dilation.data(),        // dilation    1,1,1
        outSpatialShape.data(), // output spatial shape [21, 720, 720]
        validPoints             // buffer of valid output positions (+ kernel offsets)
        );
    // build the rulebook from the validPoints array computed by getValidOutPos
    for (Index i = 0; i < numValidPoints; ++i) {
      pointPtr = validPoints + i * (NDim + 1);
      auto offset = pointPtr[NDim]; // kernel offset: which kernel weight this output is computed with
      Index oldNum = atomicAdd(indiceNum.data() + offset, Index(1)); // the rulebook count for this kernel offset
      // input side of the rulebook: input row index
      indicePairs(0, offset, oldNum) = ix;
      // flat output index
      index = tv::ArrayIndexRowMajor<NDim, NDim>::runPtrs(
        pointPtr, outSpatialShape.data(), 0) +
        spatialVolume * indicesIn(ix, 0);
      // output side of the rulebook: flat output index
      indicePairs(1, offset, oldNum) = index;
      indicePairUnique[offset * indicePairsDim2 + oldNum] = index;
    }
  }
}
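
The flat output index written into indicePairs(1, ...) and indicePairUnique is simply a row-major linearization of the output spatial position plus a batch offset of spatialVolume * batch. A minimal standalone sketch of that computation (the coordinates are made up; this is not the tv:: implementation itself):

#include <cstdint>
#include <iostream>

// Sketch of the flat index used by prepareIndicePairsKernel:
// index = rowMajor(outPos, outSpatialShape) + spatialVolume * batchIdx
int64_t flatOutputIndex(const int64_t outPos[3], const int64_t outShape[3], int64_t batchIdx) {
  int64_t spatialVolume = 1;
  for (int i = 0; i < 3; ++i) spatialVolume *= outShape[i];
  int64_t idx = 0; // row-major linearization, same result as tv::ArrayIndexRowMajor::runPtrs
  for (int i = 0; i < 3; ++i) idx = idx * outShape[i] + outPos[i];
  return idx + spatialVolume * batchIdx;
}

int main() {
  const int64_t shape[3] = {21, 720, 720}; // outSpatialShape from the example above
  const int64_t pos[3] = {10, 100, 200};   // hypothetical output voxel
  std::cout << flatOutputIndex(pos, shape, 1) << std::endl; // batch 1 -> 16142600
  return 0;
}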

getValidOutPos computes, for a given input point, the valid output positions together with the kernel-weight offset used by each one, and returns the number of valid outputs.

The code below is commented in some detail:

template <typename Index, unsigned NDim>
TV_HOST_DEVICE Index getValidOutPos(const Index *input_pos,  // spatial coordinates of the active input point
                                    const Index *kernelSize, // [3,3,3]
                                    const Index *stride,     // [2,2,2] 
                                    const Index *padding,    // [1,1,1] 
                                    const Index *dilation,   // [1,1,1]
                                    const Index *outSpatialShape, // [21, 720, 720]
                                    Index *out   // output buffer: valid output positions + kernel offsets
                                    ) {
  Index lowers[NDim];     // lower bound of the output coordinates reachable from this input
  Index uppers[NDim];     // upper bound of the output coordinates reachable from this input
  Index counter[NDim];
  Index counterSize[NDim];// number of candidate output positions per dimension
  Index pointCounter = 0; // number of valid output positions
  Index val;              // output coordinate in the current dimension
  Index numPoints = 1;
  Index m, offset;
  bool valid = false;
#pragma unroll
  // compute, per dimension, the lower and upper bounds of the output coordinates for this input
  for (int i = 0; i < NDim; ++i) {
    lowers[i] = (input_pos[i] - (kernelSize[i] - 1) * dilation[i] - 1 + stride[i] + padding[i]) / stride[i];
    uppers[i] = (input_pos[i] + padding[i]) / stride[i];
  }

#pragma unroll
  // count how many candidate output positions this input maps to (numPoints)
  for (unsigned i = 0; i < NDim; ++i) {
    counterSize[i] = ((uppers[i] - lowers[i]) / dilation[i] + 1);
    numPoints *= counterSize[i];
  }

#pragma unroll
  // initialize the counters
  for (int i = 0; i < NDim; ++i) {
    counter[i] = 0;
  }

  // Fill the out array with valid entries. out can be viewed as a [numPoints][NDim+1] 2-D array:
  // row i stores the coordinates of the i-th output position in out[i][0..NDim-1]
  for (int i = 0; i < numPoints; ++i) {
    valid = true;
    m = 1;
    offset = 0;
#pragma unroll
    // iterate over the spatial dimensions, storing each coordinate of the i-th output position
    for (int j = NDim - 1; j >= 0; --j) {
      val = uppers[j] - counter[j] * dilation[j];// output coordinate along dimension j
      // store the output coordinate for this candidate position
      out[pointCounter * (NDim + 1) + j] = val;
      // out of bounds
      if (val < 0 || (val > outSpatialShape[j] - 1)) {
        valid = false;
        // break;
      }
      // kernel offset: which kernel weight connects this input to this output
      offset += m * (input_pos[j] - val * stride[j] + padding[j]) / dilation[j];
      m *= kernelSize[j]; // m*3
    }
    // out[i][NDim] stores the kernel offset, i.e. which kernel weight is used for this (input, output) pair
    out[pointCounter * (NDim + 1) + NDim] = offset;
    if (valid)
      ++pointCounter;
    // advance the fastest-varying counter; counter[NDim-1] cycles through 0..counterSize[NDim-1]
    counter[NDim - 1] += 1; 
#pragma unroll
    // The loop below carries the per-dimension counters (counter[2], counter[1], counter[0]):
    // once dimension 2 is exhausted (counter[2] == counterSize[2]), counter[1] is incremented and counter[2] reset to 0
    // once dimension 1 is exhausted (counter[1] == counterSize[1]), counter[0] is incremented and counter[1] reset to 0
    // once dimension 0 is exhausted, counter[0] == counterSize[0] and the enclosing loop is about to end
    for (int c = NDim - 1; c >= 0; --c) {
      if (counter[c] == counterSize[c] && c > 0) {
        counter[c - 1] += 1;
        counter[c] = 0;
      }
    }
  }
  return pointCounter;
}

How the output lower and upper bounds are derived:

Take 1-D convolution as an example: the output positions a given input point contributes to depend on the kernel size k, stride s, dilation d, and padding p.

For an input position x, its distance from the border of the padded feature map is x + p.

Suppose the smallest such output position is n; then the following relation holds:

s*(n-1) + k' = x + p

where k' is the effective kernel size, which depends on the kernel size and the dilation: k' = (k-1)*(d-1) + k = d*(k-1) + 1

Substituting k', the equation becomes:

s*(n-1) + d*(k-1) + 1 = x + p

Rearranging gives lowers:

n = (x - d*(k-1) - 1 + s + p) / s

Similarly, suppose the largest such output position is n; then:

s*n = x + p

so uppers is:

n = (x + p) / s

Reference: https://github.com/traveller59/spconv/issues/224
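
Plugging the values used in this walkthrough (k = 3, s = 2, p = 1, d = 1) into the two formulas, e.g. for an input coordinate x = 5 (a minimal sketch; integer division as in the CUDA code):

#include <iostream>

// 1-D sketch of the lowers/uppers formulas from getValidOutPos.
int main() {
  const int k = 3, s = 2, p = 1, d = 1; // kernel size, stride, padding, dilation
  const int x = 5;                      // example input coordinate
  const int lower = (x - (k - 1) * d - 1 + s + p) / s; // (5-2-1+2+1)/2 = 2
  const int upper = (x + p) / s;                       // (5+1)/2      = 3
  std::cout << lower << " " << upper << std::endl;     // output positions 2 and 3 both cover x = 5
  return 0;
}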

For what the counter variables mean, refer to the commented code above; if anything in this interpretation is wrong, corrections are welcome.
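
In short, counter behaves like a mixed-radix odometer over counterSize: the last dimension is bumped every iteration and carries propagate towards dimension 0. The enumeration can be reproduced in isolation (a minimal sketch; the counterSize values are made up):

#include <iostream>

// Sketch of the counter/counterSize enumeration in getValidOutPos:
// counter enumerates all combinations, last dimension varying fastest.
int main() {
  const int NDim = 3;
  int counterSize[NDim] = {1, 2, 2};       // e.g. 1*2*2 = 4 candidate outputs
  int counter[NDim] = {0, 0, 0};
  int numPoints = counterSize[0] * counterSize[1] * counterSize[2];
  for (int i = 0; i < numPoints; ++i) {
    std::cout << counter[0] << counter[1] << counter[2] << "\n"; // 000 001 010 011
    counter[NDim - 1] += 1;                 // bump the fastest-varying digit
    for (int c = NDim - 1; c >= 0; --c) {   // propagate carries, as in the kernel
      if (counter[c] == counterSize[c] && c > 0) {
        counter[c - 1] += 1;
        counter[c] = 0;
      }
    }
  }
  return 0;
}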

create_conv_indice_pair_p2_cuda is located in src/spconv/indice.cu:

int create_conv_indice_pair_p2_cuda(
    torch::Tensor indicesIn,                // torch.Size([N, 4]) indices
    torch::Tensor indicesOut,               // torch.Size([N*27, 4])
    torch::Tensor gridsOut,                 // [4,21*720*720]
    torch::Tensor indicePairs,              // torch.Size([2,27,N])
    torch::Tensor indiceNum,                // torch.Size([27]): number of (input, output) pairs at each kernel position
    torch::Tensor indicePairUnique,         // N*27+1
    std::vector<int64_t> outSpatialShape,   // [21, 720, 720]
    bool transpose,                         // False
    bool resetGrid,                         // False
    bool useHash                            // False
    ) {
  auto stream = at::cuda::getCurrentCUDAStream();
  auto ndim = outSpatialShape.size(); // 3
  auto numActIn = indicesIn.size(0);  // N
  int batchSize = gridsOut.size(0);   // 4
  int numAct = indicePairUnique.size(0) - 1;// number of unique output indices (the int::max sentinel excluded)

  auto kernelVolume = indiceNum.size(0);
  if (numActIn == 0)
    return 0;
  bool failed = false;
  tv::dispatch_torch<int32_t>(indicesIn.scalar_type(), [&](auto IndexValue) {
    using Index = TV_DECLTYPE(IndexValue);
    using IndexGrid = int32_t;
    tv::dispatch_int<2, 3, 4>(ndim, [&](auto I) {
      constexpr int NDim = TV_DECLTYPE(I)::value;
      using IndexGrid = int32_t;
      tv::SimpleVector<Index, NDim> ou(outSpatialShape.begin(),outSpatialShape.end());
      if (useHash) { // False
          ...... // omitted
      } else {   // useHash == false: this branch is taken
        assignGridAndIndiceOutKernel<Index, IndexGrid, NDim>
            <<<tv::cuda::getBlocks(numAct), tv::cuda::CUDA_NUM_THREADS, 0,stream>>>(
                tv::torch2tv<Index>(indicesOut),    // torch.Size([N*27, 4])
                tv::torch2tv<IndexGrid>(gridsOut),  // [4,21*720*720] 
                numAct,                             // number of unique output indices
                tv::torch2tv<Index>(indicePairs),   // torch.Size([2,27,N])
                tv::torch2tv<Index>(indicePairUnique),  // unique output flat indices
                ou,                                 // output spatial shape
                batchSize                           // 4
                );
        TV_CHECK_CUDA_ERR_V2("assignGridAndIndiceOutKernel failed");
        assignIndicePairsKernel<Index, IndexGrid, NDim>
            <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
               stream>>>(tv::torch2tv<Index>(indicesOut),
                         tv::torch2tv<IndexGrid>(gridsOut), numActIn,
                         tv::torch2tv<Index>(indicePairs),
                         tv::torch2tv<Index>(indicePairUnique), ou);
        TV_CHECK_CUDA_ERR_V2("assignIndicePairsKernel failed");
#ifdef TV_LOG_KERNEL_INFO
          ...... // logging omitted
#endif
      }

      if (resetGrid && (!useHash)) { // False
        resetGridKernel<Index, IndexGrid, NDim>
            <<<tv::cuda::getBlocks(numAct), tv::cuda::CUDA_NUM_THREADS, 0,stream>>>(indicePairUnique.data_ptr<Index>(),tv::torch2tv<IndexGrid>(gridsOut), numAct);
        TV_CHECK_CUDA_ERR_V2("resetGridKernel failed");
      }
    });
  });
  if (failed){
    return -1;
  }
  return numAct;
}

assignGridAndIndiceOutKernel is located in include/spconv/indice.cu.h:

template <typename Index, typename IndexGrid, unsigned NDim>
__global__ void assignGridAndIndiceOutKernel(
    tv::TensorView<Index> indicesOut,                     // torch.Size([N*27, 4]), to be filled here
    tv::TensorView<IndexGrid> gridsOut,                   // [4,21*720*720], to be filled here
    int numAct,                                           // number of unique output indices
    tv::TensorView<Index> indicePairs,                    // torch.Size([2,27,N])
    tv::TensorView<Index> indicePairUnique,               // unique output flat indices
    const tv::SimpleVector<Index, NDim> outSpatialShape,  // output spatial shape
    int batchSize                                         // 4
    ) {

  Index index;
  auto indicesOutPtr = indicesOut.data();
  for (int ix : tv::KernelLoopX<int>(numAct)) {
    index = indicePairUnique[ix];  // flat output index (batch * spatialVolume + spatial index)
    gridsOut[index] = ix;          // map flat output index -> compact output row ix

    // decode the spatial coordinates into indicesOut row ix; the remainder is the batch index
    index = tv::rowArrayIdxInv<Index, NDim>(index, indicesOutPtr + ix * (NDim + 1) + 1, outSpatialShape.data());
    indicesOut[ix * (NDim + 1)] = index % batchSize;  // store the batch index in column 0
  }
}

rowArrayIdxInv is located in include/tensorview/tensorview.h:

template <typename Index, unsigned NDim>
TV_HOST_DEVICE_INLINE Index rowArrayIdxInv(Index index, Index *output,const Index *shape) {
#pragma unroll
  for (int i = NDim - 1; i >= 0; --i) {
    output[i] = index % shape[i];
    index -= output[i];
    index /= shape[i];
  }
  return index;
}
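
So rowArrayIdxInv decodes a row-major flat index back into per-dimension coordinates and returns whatever is left above dimension 0, which in assignGridAndIndiceOutKernel is the batch index. A small round-trip sketch (shapes and coordinates are illustrative only):

#include <cstdint>
#include <iostream>

// Minimal sketch mirroring rowArrayIdxInv for NDim = 3.
int64_t rowArrayIdxInv3(int64_t index, int64_t *output, const int64_t *shape) {
  for (int i = 2; i >= 0; --i) {
    output[i] = index % shape[i];
    index -= output[i];
    index /= shape[i];
  }
  return index; // whatever remains above the spatial dims (here: the batch index)
}

int main() {
  const int64_t shape[3] = {21, 720, 720};
  const int64_t pos[3] = {10, 100, 200};
  const int64_t batch = 1;
  // forward: flat index as computed in prepareIndicePairsKernel
  int64_t flat = ((pos[0] * shape[1] + pos[1]) * shape[2] + pos[2]) +
                 batch * shape[0] * shape[1] * shape[2];
  int64_t decoded[3];
  int64_t b = rowArrayIdxInv3(flat, decoded, shape);
  std::cout << b << " " << decoded[0] << " " << decoded[1] << " " << decoded[2] << std::endl;
  // prints: 1 10 100 200
  return 0;
}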

Next, look at assignIndicePairsKernel:

template <typename Index, typename IndexGrid, unsigned NDim>
__global__ void
assignIndicePairsKernel(tv::TensorView<Index> indicesOut,
                        tv::TensorView<IndexGrid> gridsOut, int numActIn,
                        tv::TensorView<Index> indicePairs,
                        tv::TensorView<Index> indicePairUnique,
                        const tv::SimpleVector<Index, NDim> outSpatialShape) {

  Index index;
  int kernelVolume = indicePairs.dim(1);
  auto indicePairsOut = indicePairs.subview(1); // the output half of the rulebook (flat output indices)

  for (int ix : tv::KernelLoopX<int>(numActIn)) {
    for (int i = 0; i < kernelVolume; ++i) {
      index = indicePairsOut(i, ix);              // flat output index written in phase 1
      if (index > -1) {
        indicePairsOut(i, ix) = gridsOut[index];  // replace it with the compact output row index
      }
    }
  }
}

subview is located in include/tensorview/tensorview.h; it appears to take a sub-view (subset) of the tensor:

  TV_HOST_DEVICE_INLINE TensorView<T, -1, PtrTraits, Tindex>
  subview(SimpleVector<int> ids) const {
    Shape start = ids;
    for (int i = ids.size(); i < ndim(); ++i) {
      start.push_back(0);
    }
    return TensorView<T, Rank, PtrTraits, Tindex>(
        ptr_ + rowArrayIdx(shape_, start), shape_.subshape(ids.size()));
  }