普通3d稀疏卷积RuleBook构建
我们继续看普通稀疏卷积RuleBook的建立过程,返回src/spconv/spconv_ops.cc
,看getIndicePairs
函数的普通3D稀疏卷积部分
// torch.numel()统计元素的个数 N*2*27/2+1
auto indicePairUnique = torch::full({indicePairs.numel() / 2 + 1}, std::numeric_limits<int>::max(),torch::dtype(torch::kInt32).device(indices.device()));
// [N*27,4]
torch::Tensor outInds = torch::zeros({numAct * kernelVolume, coorDim + 1},torch::dtype(torch::kInt32).device(indices.device()));
if (indices.device().type() == torch::kCPU) { // CPU
numActOut = create_conv_indice_pair_cpu(indices, outInds, gridOut, indicePairs, indiceNum, kernelSize, stride,padding, dilation, outSpatialShape, transpose, false, useHash);
}
#ifdef TV_CUDA
else if (indices.device().type() == torch::kCUDA) { // GPU
numActOut = create_conv_indice_pair_p1_cuda(indices, // torch.Size([N, 4]) voxel空间索引
indicePairs, // torch.Size([2,27,N]),-1填充 保存 rulebook
indiceNum, // torch.Size([27]) 用于保存卷积核每一个位置上的总的计算的次数
indicePairUnique, // N*27+1
kernelSize, // [3,3,3]
stride, // [2,2,2]
padding, // [1,1,1]
dilation, // [0,0,0]
outSpatialShape, // [21, 720, 720]
transpose // False
);
if (numActOut > 0) {
auto res = torch::_unique(indicePairUnique);
indicePairUnique = std::get<0>(res);
numActOut = create_conv_indice_pair_p2_cuda(indices,
outInds,
gridOut,
indicePairs,
indiceNum,
indicePairUnique,
outSpatialShape,
transpose,
false,
useHash);
if (numActOut == -1) {
auto device = indices.device();
outInds = outInds.to({torch::kCPU});
indicePairs = indicePairs.to({torch::kCPU});
indiceNum = indiceNum.to({torch::kCPU});
indices = indices.to({torch::kCPU});
numActOut = create_conv_indice_pair_cpu(indices, outInds, gridOut, indicePairs, indiceNum, kernelSize,stride, padding, dilation, outSpatialShape, transpose, false,useHash);
return {outInds.to(device).slice(0, 0, numActOut),indicePairs.to(device), indiceNum.to(device)};
}
}
}
普通3d稀疏卷积调用create_conv_indice_pair_p1_cuda
和create_conv_indice_pair_p2_cuda
,我们先看create_conv_indice_pair_p1_cuda
函数,位于src/spconv/indice.cu
int create_conv_indice_pair_p1_cuda(
torch::Tensor indicesIn, // torch.Size([N, 4])
torch::Tensor indicePairs, // torch.Size([2,27,N]),-1填充 保存 rulebook
torch::Tensor indiceNum, // torch.Size([27]) 用于保存卷积核每一个位置上的总的计算的次数
torch::Tensor indicePairUnique, // N*27+1
std::vector<int64_t> kernelSize, // [3,3,3]
std::vector<int64_t> stride, // [2,2,2]
std::vector<int64_t> padding, // [1,1,1]
std::vector<int64_t> dilation, // [0,0,0]
std::vector<int64_t> outSpatialShape, // [21, 720, 720]
bool transpose // False
) {
auto stream = at::cuda::getCurrentCUDAStream();
auto ndim = kernelSize.size(); // 3
auto numActIn = indicesIn.size(0); // N
auto kernelVolume = indiceNum.size(0); // 27
if (numActIn == 0)
return 0;
tv::dispatch_torch<int32_t>(indicesIn.scalar_type(), [&](auto IndexValue) {
using Index = TV_DECLTYPE(IndexValue);
using IndexGrid = int32_t;
tv::dispatch_int<2, 3, 4>(ndim, [&](auto I) {
constexpr int NDim = TV_DECLTYPE(I)::value;
// 将参数信息复制到tv::SimpleVector类型相关变量上
tv::SimpleVector<Index, NDim> ks(kernelSize.begin(), kernelSize.end());
tv::SimpleVector<Index, NDim> st(stride.begin(), stride.end());
tv::SimpleVector<Index, NDim> pa(padding.begin(), padding.end());
tv::SimpleVector<Index, NDim> di(dilation.begin(), dilation.end());
tv::SimpleVector<Index, NDim> ou(outSpatialShape.begin(),outSpatialShape.end());
tv::DispatchInt<max_kernel_vol_t>()(kernelVolume, std::less_equal<int>(), [&](auto I2) {
constexpr int MaxKernelVolume = TV_DECLTYPE(I2)::value;
if (transpose) { // False
prepareDeConvIndicePairsKernel<Index, NDim, MaxKernelVolume>
<<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS,
0, stream>>>(tv::torch2tv<Index>(indicesIn),
tv::torch2tv<Index>(indicePairs),
tv::torch2tv<Index>(indiceNum),
tv::torch2tv<Index>(indicePairUnique), ks, st,pa, di, ou);
TV_CHECK_CUDA_ERR_V2("prepareDeConvIndicePairsKernel failed");
} else {
prepareIndicePairsKernel<Index, NDim, MaxKernelVolume>
<<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS,0, stream>>>(
tv::torch2tv<Index>(indicesIn), // torch.Size([N, 4])
tv::torch2tv<Index>(indicePairs), // torch.Size([2,27,N]),-1填充 保存 rulebook
tv::torch2tv<Index>(indiceNum), // torch.Size([27]) 用于保存卷积核每一个位置上的总的计算的次数
tv::torch2tv<Index>(indicePairUnique),
ks, // 卷积核尺寸
st, // 步长
pa, // 填充
di, // 膨胀卷积
ou // 输出形状
);
TV_CHECK_CUDA_ERR_V2("prepareIndicePairsKernel failed");
}
#ifdef TV_LOG_KERNEL_INFO
cudaFuncAttributes attr;
checkCudaErrors(cudaFuncGetAttributes(
&attr,
prepareDeConvIndicePairsKernel<Index, NDim, MaxKernelVolume>));
tv::ssprint("prepareIndicePairsKernel<", tv::type_s<Index>, NDim,
MaxKernelVolume, ">", attr.numRegs);
#endif
});
});
});
return 1;
}
重点看prepareIndicePairsKernel
核函数
template <typename Index, unsigned NDim, int KernelMaxVolume = 256,typename Index1D = int>
__global__ void prepareIndicePairsKernel(
tv::TensorView<const Index> indicesIn, // torch.Size([N, 4])
tv::TensorView<Index> indicePairs, // torch.Size([2,27,N]),-1填充 保存 rulebook
tv::TensorView<Index> indiceNum, // torch.Size([27]) 用于保存卷积核每一个位置上的总的计算的次数
tv::TensorView<Index1D> indicePairUnique,
const tv::SimpleVector<Index, NDim> kernelSize, // 卷积核尺寸
const tv::SimpleVector<Index, NDim> stride, // 步长
const tv::SimpleVector<Index, NDim> padding, // 填充
const tv::SimpleVector<Index, NDim> dilation, // 膨胀卷积
const tv::SimpleVector<Index, NDim> outSpatialShape // 输出形状[21, 720, 720]
) {
auto numActIn = indicesIn.dim(0); // N
Index spatialVolume = 1; // 没用到
#pragma unroll
for (int i = 0; i < NDim; ++i) {
spatialVolume *= outSpatialShape[i]; // 21*720*720
}
Index kernelVolume = 1;
#pragma unroll
for (int i = 0; i < NDim; ++i) {
kernelVolume *= kernelSize[i]; // 3*3*3
}
Index numValidPoints = 0;
Index validPoints[KernelMaxVolume * (NDim + 1)]; // 27*4
Index *pointPtr = nullptr;
auto indicePairsDim2 = indicePairs.dim(2); // N
Index index;
for (int ix : tv::KernelLoopX<int>(numActIn)) {
numValidPoints = getValidOutPos<Index, NDim>(
indicesIn.data() + ix * (NDim + 1) + 1,
kernelSize.data(), // 卷积核尺寸 3,3,3
stride.data(), // 步长 2,2,2
padding.data(), // 填充 1,1,1
dilation.data(), // 膨胀卷积 1,1,1
outSpatialShape.data(), // 输出形状[21, 720, 720]
validPoints // 输出哈希表
);
// 依靠 getValidOutPos 计算得到的 out 数组完成rulebook的建立
for (Index i = 0; i < numValidPoints; ++i) {
pointPtr = validPoints + i * (NDim + 1);
auto offset = pointPtr[NDim]; // 表示输出用到卷积核那个weight来计算
Index oldNum = atomicAdd(indiceNum.data() + offset, Index(1)); // 代表Rulebook的count
// 输入张量到输入序号
indicePairs(0, offset, oldNum) = ix;
// 输出序号
index = tv::ArrayIndexRowMajor<NDim, NDim>::runPtrs(
pointPtr, outSpatialShape.data(), 0) +
spatialVolume * indicesIn(ix, 0);
// 输出张量到输出序号
indicePairs(1, offset, oldNum) = index;
indicePairUnique[offset * indicePairsDim2 + oldNum] = index;
}
}
}
getValidOutPos
作用根据输入点计算输出哈希表和输出所用到的卷积核权重的位置,同时返回有效输出个数
直接看下列代码,注释比较详细了
template <typename Index, unsigned NDim>
TV_HOST_DEVICE Index getValidOutPos(const Index *input_pos, // 有效active点的输入位置坐标
const Index *kernelSize, // [3,3,3]
const Index *stride, // [2,2,2]
const Index *padding, // [1,1,1]
const Index *dilation, // [0,0,0]
const Index *outSpatialShape, // [21, 720, 720]
Index *out // 输出哈希表
) {
Index lowers[NDim]; // 输入点对应输出点坐标的上限
Index uppers[NDim]; // 输入点对应输出点坐标的下限
Index counter[NDim];
Index counterSize[NDim];// 各个维度的输出点数
Index pointCounter = 0; // 有效的输出点数
Index val; // 输出序号
Index numPoints = 1;
Index m, offset;
bool valid = false;
#pragma unroll
// 在各个维度上计算输入点对应输出点的上限和下限
for (int i = 0; i < NDim; ++i) {
lowers[i] = (input_pos[i] - (kernelSize[i] - 1) * dilation[i] - 1 + stride[i] + padding[i]) / stride[i];
uppers[i] = (input_pos[i] + padding[i]) / stride[i];
}
#pragma unroll
// 计算每个输入对应输出点数numPoints
for (unsigned i = 0; i < NDim; ++i) {
counterSize[i] = ((uppers[i] - lowers[i]) / dilation[i] + 1);
numPoints *= counterSize[i];
}
#pragma unroll
// 初始化
for (int i = 0; i < NDim; ++i) {
counter[i] = 0;
}
// 对输出数组做一个有效的填充,可以把out理解为一个[N][Ndim+1]的二维数组
// 每一行表示一个输出位置i,out[i][0],...,out[i][Ndim-1]存储第i个输出位置的索引
for (int i = 0; i < numPoints; ++i) {
valid = true;
m = 1;
offset = 0;
#pragma unroll
// 各特征维度遍历,存储第i个输出位置的各个维度索引
for (int j = NDim - 1; j >= 0; --j) {
val = uppers[j] - counter[j] * dilation[j];// 输出序号
// 输入对应的输出哈希表
out[pointCounter * (NDim + 1) + j] = val;
// 越界
if (val < 0 || (val > outSpatialShape[j] - 1)) {
valid = false;
// break;
}
// 输入对应每个输出点在卷积核上的偏移
offset += m * (input_pos[j] - val * stride[j] + padding[j]) / dilation[j];
m *= kernelSize[j]; // m*3
}
// out[i][Ndim]存储于输入相作用kernel的偏移(offset)(即用卷积核中的那个权重计算)
out[pointCounter * (NDim + 1) + NDim] = offset;
if (valid)
++pointCounter;
// 让counter[2]值在0~counterSize[2]循环变化
counter[NDim - 1] += 1;
#pragma unroll
// 下面for循环作用:计算输出点各个维度的索引值(即counter[2],counter[1],counter[0])
// 遍历完第2维度后,有counter[2]=counterSize[2]-->counter[1]++,counter[2]=0
// 遍历完第1维度后,有counter[1]=counterSize[1]-->counter[0]++,counter[1]=0
// 遍历完第0维度后,有counter[1]=counterSize[1]
for (int c = NDim - 1; c >= 0; --c) {
if (counter[c] == counterSize[c] && c > 0) {
counter[c - 1] += 1;
counter[c] = 0;
}
}
}
return pointCounter;
}
关于输出上下限如何得出,计算过程如下:
以 1-dim
卷积为例:给定输入点的输出点取决于内核大小 k
、步长 s
、扩张 d
和填充 p
对于输入位置 x,它到特征图边界的距离为:x+p
假设输出点的最小值为n,有以下关系:
s * (n-1) + k^{'} = x + p
其中k’是有效内核大小,它取决于内核大小和膨胀: k’=(k-1)*(d-1)+k
带入k’等式变为:
s*(n-1)+(k-1)*(d-1)+k=x+p
重新排列,计算lowers为:
n = {(x-d*(k-1)-1+s+p)}/{s}
同理,假设输出点的最大值为n,则有如下关系:
s*n=x+p
则计算uppers为:
n=(x+p)/s
参考:https://github.com/traveller59/spconv/issues/224
对于counter变量含义可以参考注释代码,如哪些地方理解有误,也麻烦大家指出来。
create_conv_indice_pair_p2_cuda
位于:src/spconv/indice.cu
int create_conv_indice_pair_p2_cuda(
torch::Tensor indicesIn, // torch.Size([N, 4]) indices
torch::Tensor indicesOut, // torch.Size([N*27, 4])
torch::Tensor gridsOut, // [4,21*720*720]
torch::Tensor indicePairs, // torch.Size([2,27,N])
torch::Tensor indiceNum, // torch.Size([27]) 用于保存卷积核每一个位置上的总的计算的次数
torch::Tensor indicePairUnique, // N*27+1
std::vector<int64_t> outSpatialShape, // [21, 720, 720]
bool transpose, // False
bool resetGrid, // False
bool useHash // False
) {
auto stream = at::cuda::getCurrentCUDAStream();
auto ndim = outSpatialShape.size(); // 3
auto numActIn = indicesIn.size(0); // N
int batchSize = gridsOut.size(0); // 4
int numAct = indicePairUnique.size(0) - 1;// 不重复输出序号个数-1
auto kernelVolume = indiceNum.size(0);
if (numActIn == 0)
return 0;
bool failed = false;
tv::dispatch_torch<int32_t>(indicesIn.scalar_type(), [&](auto IndexValue) {
using Index = TV_DECLTYPE(IndexValue);
using IndexGrid = int32_t;
tv::dispatch_int<2, 3, 4>(ndim, [&](auto I) {
constexpr int NDim = TV_DECLTYPE(I)::value;
using IndexGrid = int32_t;
tv::SimpleVector<Index, NDim> ou(outSpatialShape.begin(),outSpatialShape.end());
if (useHash) { // False
...... // 略
} else { // True
assignGridAndIndiceOutKernel<Index, IndexGrid, NDim>
<<<tv::cuda::getBlocks(numAct), tv::cuda::CUDA_NUM_THREADS, 0,stream>>>(
tv::torch2tv<Index>(indicesOut), // torch.Size([N*27, 4])
tv::torch2tv<IndexGrid>(gridsOut), // [4,21*720*720]
numAct, // 不重复输出序号个数-1
tv::torch2tv<Index>(indicePairs), // torch.Size([2,27,N])
tv::torch2tv<Index>(indicePairUnique), // 不重复输出序号
ou, // 输出形状
batchSize // 4
);
TV_CHECK_CUDA_ERR_V2("assignGridAndIndiceOutKernel failed");
assignIndicePairsKernel<Index, IndexGrid, NDim>
<<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
stream>>>(tv::torch2tv<Index>(indicesOut),
tv::torch2tv<IndexGrid>(gridsOut), numActIn,
tv::torch2tv<Index>(indicePairs),
tv::torch2tv<Index>(indicePairUnique), ou);
TV_CHECK_CUDA_ERR_V2("assignIndicePairsKernel failed");
#ifdef TV_LOG_KERNEL_INFO
...... // 日志略
#endif
}
if (resetGrid && (!useHash)) { // False
resetGridKernel<Index, IndexGrid, NDim>
<<<tv::cuda::getBlocks(numAct), tv::cuda::CUDA_NUM_THREADS, 0,stream>>>(indicePairUnique.data_ptr<Index>(),tv::torch2tv<IndexGrid>(gridsOut), numAct);
TV_CHECK_CUDA_ERR_V2("resetGridKernel failed");
}
});
});
if (failed){
return -1;
}
return numAct;
}
assignGridAndIndiceOutKernel
位于:include/spconv/indice.cu.h
template <typename Index, typename IndexGrid, unsigned NDim>
__global__ void assignGridAndIndiceOutKernel(
tv::TensorView<Index> indicesOut, // torch.Size([N*27, 4]) 需要计算的
tv::TensorView<IndexGrid> gridsOut, // [4,21*720*720] 需要计算的
int numAct, // 不重复输出序号个数-1
tv::TensorView<Index> indicePairs, // torch.Size([2,27,N])
tv::TensorView<Index> indicePairUnique, // 不重复输出序号
const tv::SimpleVector<Index, NDim> outSpatialShape, // 输出形状
int batchSize // 4
) {
Index index;
auto indicesOutPtr = indicesOut.data();
for (int ix : tv::KernelLoopX<int>(numAct)) {
index = indicePairUnique[ix];
gridsOut[index] = ix;
index = tv::rowArrayIdxInv<Index, NDim>(index, indicesOutPtr + ix * (NDim + 1) + 1, outSpatialShape.data());
indicesOut[ix * (NDim + 1)] = index % batchSize;
}
}
rowArrayIdxInv
位于:include/tensorview/tensorview.h
template <typename Index, unsigned NDim>
TV_HOST_DEVICE_INLINE Index rowArrayIdxInv(Index index, Index *output,const Index *shape) {
#pragma unroll
for (int i = NDim - 1; i >= 0; --i) {
output[i] = index % shape[i];
index -= output[i];
index /= shape[i];
}
return index;
}
继续看assignIndicePairsKernel
:
template <typename Index, typename IndexGrid, unsigned NDim>
__global__ void
assignIndicePairsKernel(tv::TensorView<Index> indicesOut,
tv::TensorView<IndexGrid> gridsOut, int numActIn,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indicePairUnique,
const tv::SimpleVector<Index, NDim> outSpatialShape) {
Index index;
int kernelVolume = indicePairs.dim(1);
auto indicePairsOut = indicePairs.subview(1); // 从rulebook中获取输出张量到输出序号的哈希表
for (int ix : tv::KernelLoopX<int>(numActIn)) {
for (int i = 0; i < kernelVolume; ++i) {
index = indicePairsOut(i, ix);
if (index > -1) {
indicePairsOut(i, ix) = gridsOut[index];
}
}
}
}
subview
位于:include/tensorview/tensorview.h
,意思应该获取子集
TV_HOST_DEVICE_INLINE TensorView<T, -1, PtrTraits, Tindex>
subview(SimpleVector<int> ids) const {
Shape start = ids;
for (int i = ids.size(); i < ndim(); ++i) {
start.push_back(0);
}
return TensorView<T, Rank, PtrTraits, Tindex>(
ptr_ + rowArrayIdx(shape_, start), shape_.subshape(ids.size()));
}
评论(0)
您还未登录,请登录后发表或查看评论