SSD是one-stage目标检测方法,和yolo一样。可以同时进行目标检测和分类,速度很快。
SSD主要流程:
选取合适的模型结构,挑选其中合适的特征层或者所有特征层作为backbone,再之后加上额外的卷积网络,组成SSD网络
选取其中的6层卷积层输出,对卷积层输出做2个操作。
坐标信息卷积处理:num_anchors x 4
分类信息卷积处理:num_anchors x num_classes
预测结果解码
具体代码可以参考CSDN@Bubbliiiing的代码,本次实现的是Resnet50实现SSD检测,这是为了与之前的Faster RCNN做对比,其他代码可以参考他的代码,我将我实现的Resnet 50 backbone贴出来,需要的小伙伴粘贴复制修改参考即可。
import numpy as np
import torch
import torch.nn.functional as F
from torch import nn
from torch.nn import init
from torchvision.models import resnet50
class L2Norm(nn.Module):
    """Channel-wise L2 normalisation followed by a learnable per-channel scale.

    Args:
        n_channels: Number of channels (dim 1) of the feature map to normalise.
        scale: Initial value written into every entry of the scale vector.
    """

    def __init__(self, n_channels, scale):
        super().__init__()
        self.n_channels = n_channels
        # Keep the value exactly as given: the former ``scale or None`` turned
        # a scale of 0 into None, which ``init.constant_`` cannot fill with.
        self.gamma = scale
        # Added to the norm so an all-zero position does not divide by zero.
        self.eps = 1e-10
        self.weight = nn.Parameter(torch.empty(self.n_channels))
        self.reset_parameters()

    def reset_parameters(self):
        """Fill the learnable scale vector with the initial ``gamma`` value."""
        init.constant_(self.weight, self.gamma)

    def forward(self, x):
        """Return ``x`` divided by its L2 norm over dim 1, scaled per channel.

        The weight is reshaped to ``(1, C, 1, 1)``, so ``x`` is expected to be
        a 4-D tensor whose dim 1 has ``n_channels`` entries.
        """
        norm = x.pow(2).sum(dim=1, keepdim=True).sqrt() + self.eps
        # Out-of-place division: the caller's tensor is left untouched.
        x = torch.div(x, norm)
        return self.weight.view(1, -1, 1, 1) * x
def add_extras(in_channels):
    """Build the four extra convolutions appended after the backbone.

    The spatial sizes in the comments below assume a 19x19 input feature map.

    Args:
        in_channels: Channel count of the feature map fed to the first layer.

    Returns:
        An ``nn.ModuleList`` holding the four ``nn.Conv2d`` layers in order.
    """
    return nn.ModuleList([
        # Block 6: 19x19 -> 10x10, 512 channels
        nn.Conv2d(in_channels, 512, kernel_size=3, stride=2, padding=1),
        # Block 7: 10x10 -> 5x5, 256 channels
        nn.Conv2d(512, 256, kernel_size=3, stride=2, padding=1),
        # Block 8: 5x5 -> 3x3, 128 channels (no padding)
        nn.Conv2d(256, 128, kernel_size=3, stride=1),
        # Block 9: 3x3 -> 1x1, 256 channels (no padding)
        nn.Conv2d(128, 256, kernel_size=3, stride=1),
    ])
def resnet_backbone(pretrained=True, progress=True, **kwargs):
    """Return a truncated ResNet-50 and the channel counts of its last two stages.

    Args:
        pretrained: Forwarded to ``torchvision.models.resnet50``.
        progress: Forwarded to ``resnet50`` (previously accepted but ignored).
        **kwargs: Forwarded to ``resnet50`` (previously accepted but ignored).

    Returns:
        A tuple ``(model, layer2_out_channels, layer3_out_channels)`` where
        ``model`` is an ``nn.Sequential`` of the first seven child modules of
        the ResNet-50, i.e. everything up to and including ``layer3``.
    """
    resnet50_model = resnet50(pretrained=pretrained, progress=progress, **kwargs)
    # Keep the first seven children; the remaining ones are not used by SSD.
    model = nn.Sequential(*list(resnet50_model.children())[:7])
    # Output width of a stage = out_channels of the last conv in its last block.
    layer2_out_channels = resnet50_model.layer2[-1].conv3.out_channels
    layer3_out_channels = resnet50_model.layer3[-1].conv3.out_channels
    return model, layer2_out_channels, layer3_out_channels
class SSD_Resnet50(nn.Module):
    """SSD detection network on top of a truncated ResNet-50 backbone.

    Six feature maps are used for prediction: the outputs of backbone children
    5 and 6, followed by the outputs of the four extra convolutions. Each
    feature map gets one 3x3 localisation head and one 3x3 classification head.

    Args:
        num_classes: Number of class scores predicted per anchor.
        pretrained: Passed to ``resnet_backbone`` to load pretrained weights.
    """

    def __init__(self, num_classes, pretrained=False):
        super().__init__()
        self.num_classes = num_classes
        # Backbone plus the channel counts of its last two stages, which are
        # the first and second prediction sources.
        self.features, six_outchannels, seven_outchannels = resnet_backbone(pretrained)
        # Four extra convolutions provide prediction sources three to six.
        self.extras = add_extras(seven_outchannels)
        # Applied to the output of backbone child 6, so it must match that
        # stage's channel count (1024 for ResNet-50) rather than a literal.
        self.L2Norm = L2Norm(seven_outchannels, 20)

        # Anchors per spatial position for each of the six prediction sources.
        mbox = [4, 6, 6, 6, 4, 4]
        source_channels = [six_outchannels, seven_outchannels]
        source_channels += [layer.out_channels for layer in self.extras]

        loc_layers = []
        conf_layers = []
        for channels, num_anchors in zip(source_channels, mbox):
            # 4 box offsets per anchor.
            loc_layers.append(
                nn.Conv2d(channels, num_anchors * 4, kernel_size=(3, 3), padding=1))
            # num_classes scores per anchor.
            conf_layers.append(
                nn.Conv2d(channels, num_anchors * num_classes, kernel_size=(3, 3), padding=1))
        self.loc = nn.ModuleList(loc_layers)
        self.conf = nn.ModuleList(conf_layers)

    def forward(self, x):
        """Run the detector.

        Returns:
            A tuple ``(loc, conf)`` reshaped to ``[batch, num_anchors, 4]`` and
            ``[batch, num_anchors, num_classes]`` respectively.
        """
        sources = []

        # Backbone children 0-4 are run without tapping their outputs.
        for idx in range(5):
            x = self.features[idx](x)

        # First prediction source (38x38 for a 300x300 input).
        x = self.features[5](x)
        sources.append(x)

        # Second prediction source (19x19 for a 300x300 input), L2-normalised.
        # The normalised tensor is also what feeds the extra layers.
        x = self.features[6](x)
        x = self.L2Norm(x)
        sources.append(x)

        # Remaining four prediction sources: 10x10, 5x5, 3x3, 1x1.
        for layer in self.extras:
            x = F.relu(layer(x), inplace=True)
            sources.append(x)

        loc = []
        conf = []
        for feature, loc_head, conf_head in zip(sources, self.loc, self.conf):
            # [batch, channels, h, w] -> [batch, h, w, channels] so that the
            # values of one anchor end up contiguous after flattening.
            loc.append(loc_head(feature).permute(0, 2, 3, 1).contiguous())
            conf.append(conf_head(feature).permute(0, 2, 3, 1).contiguous())

        # Flatten every source and concatenate all six along dim 1.
        loc = torch.cat([o.view(o.size(0), -1) for o in loc], 1)
        conf = torch.cat([o.view(o.size(0), -1) for o in conf], 1)

        return (
            loc.view(loc.size(0), -1, 4),
            conf.view(conf.size(0), -1, self.num_classes),
        )
if __name__ == '__main__':
    from torchsummary import summary

    # Fall back to the CPU so this smoke test also runs on machines without a
    # GPU; the unconditional .cuda() call raised there.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = SSD_Resnet50(10).to(device)
    # summary() prints the table itself, so it is not wrapped in print().
    summary(model, (3, 300, 300), device=device)
Anchor box
Faster RCNN、yolov3、v4、v5都有各自的anchor box,SSD也不例外。在6个多尺度特征层上,每个位置分别生成mbox = (4, 6, 6, 6, 4, 4)个anchor box:例如resnet的第六层(38x38)的每个像素点生成4个anchor box,对应的宽高比(aspect_ratios)为[1, 1, 2, 1/2];resnet的第七层(19x19)的每个像素点生成6个anchor box,对应的宽高比为[1, 1, 2, 1/2, 3, 1/3]。
那么怎么生成anchor box?
anchors = []
# Build the anchors of every prediction feature map in turn. Per the original
# note, the 300x300 setup yields 38*38*4=5776, 19*19*6=2166, 600, 150, 36 and
# 4 boxes, i.e. 8732 in total.
for i, layer_height in enumerate(feature_heights):
    anchor_boxes = AnchorBox(
        input_shape,
        anchors_size[i],
        # The next entry of anchors_size is this layer's max_size.
        max_size=anchors_size[i + 1],
        aspect_ratios=aspect_ratios[i],
    ).call([layer_height, feature_widths[i]])
    anchors.append(anchor_boxes)
class AnchorBox():
    """Generate the normalised prior (anchor) boxes of one feature map.

    Args:
        input_shape: ``(height, width)`` of the network input image.
        min_size: Side length of the small square anchor; also the base size
            of the rectangular anchors.
        max_size: Used with ``min_size`` for the larger square anchor
            (``sqrt(min_size * max_size)``). Required whenever a second
            ratio equal to 1 is present, which is always the case when
            ``aspect_ratios`` contains 1.
        aspect_ratios: Iterable of aspect ratios. Every ratio ``ar`` is stored
            together with its reciprocal ``1 / ar``; must not be ``None``.
        flip: Accepted for interface compatibility; currently not used.
    """

    def __init__(self, input_shape, min_size, max_size=None, aspect_ratios=None, flip=True):
        self.input_shape = input_shape
        # The original note gives anchors_size = [30, 60, 111, 162, 213, 264, 315];
        # min_size is anchors_size[i] and max_size is anchors_size[i + 1].
        self.min_size = min_size
        self.max_size = max_size
        # Each ratio is immediately followed by its reciprocal,
        # e.g. [1, 2] -> [1, 1.0, 2, 0.5].
        self.aspect_ratios = [
            ratio for ar in aspect_ratios for ratio in (ar, 1.0 / ar)
        ]

    def call(self, layer_shape, mask=None):
        """Return the anchors of a feature map of size ``layer_shape``.

        Args:
            layer_shape: ``(height, width)`` of the feature map, e.g. 38x38.
            mask: Accepted for interface compatibility; currently not used.

        Returns:
            A NumPy array of shape ``(height * width * num_anchors, 4)`` with
            rows ``(x_min, y_min, x_max, y_max)``, divided by the image size
            and clipped to ``[0, 1]``.
        """
        layer_height, layer_width = layer_shape[0], layer_shape[1]
        img_height, img_width = self.input_shape[0], self.input_shape[1]

        box_widths = []
        box_heights = []
        for ar in self.aspect_ratios:
            if ar == 1:
                if not box_widths:
                    # A ratio of 1 in first position: the small square.
                    side = self.min_size
                else:
                    # Any later ratio of 1: the larger square.
                    side = np.sqrt(self.min_size * self.max_size)
                box_widths.append(side)
                box_heights.append(side)
            else:
                # Rectangle with the same area as the small square.
                box_widths.append(self.min_size * np.sqrt(ar))
                box_heights.append(self.min_size / np.sqrt(ar))

        # Half extents, so they can be added to / subtracted from the centres.
        half_widths = 0.5 * np.array(box_widths)
        half_heights = 0.5 * np.array(box_heights)

        # Stride of one feature-map cell measured in input pixels.
        step_x = img_width / layer_width
        step_y = img_height / layer_height

        # Cell centres in input-pixel coordinates.
        linx = np.linspace(0.5 * step_x, img_width - 0.5 * step_x, layer_width)
        liny = np.linspace(0.5 * step_y, img_height - 0.5 * step_y, layer_height)
        centers_x, centers_y = np.meshgrid(linx, liny)
        centers_x = centers_x.reshape(-1, 1)
        centers_y = centers_y.reshape(-1, 1)

        # Every anchor needs the centre twice: once to derive the top-left
        # corner and once for the bottom-right corner, hence 2 * num_anchors.
        num_anchors_ = len(self.aspect_ratios)
        anchor_boxes = np.concatenate((centers_x, centers_y), axis=1)
        anchor_boxes = np.tile(anchor_boxes, (1, 2 * num_anchors_))

        # Columns repeat as (x_min, y_min, x_max, y_max) per anchor.
        anchor_boxes[:, ::4] -= half_widths
        anchor_boxes[:, 1::4] -= half_heights
        anchor_boxes[:, 2::4] += half_widths
        anchor_boxes[:, 3::4] += half_heights

        # Normalise by the image size: x columns are even, y columns are odd.
        anchor_boxes[:, ::2] /= img_width
        anchor_boxes[:, 1::2] /= img_height
        anchor_boxes = anchor_boxes.reshape(-1, 4)

        # Clip boxes that extend past the image border.
        return np.minimum(np.maximum(anchor_boxes, 0.0), 1.0)
第六层 第七层指的是(参考:https://blog.csdn.net/kui9702/article/details/123807917 中第6层,如下图)
之后我们来看SSD是如何计算loss,从上面模型的forward中可以看到,每一次前向传播会返回坐标回归预测结果和每一个anchor box对应的分类预测结果。
# Fragment of the loss computation (the enclosing method header is not shown).
# Concatenate the first element of the model output with the softmax (over the
# last dim) of the second element, so that one tensor holds both.
# NOTE(review): presumably y_pred[0] is the box regression and y_pred[1] the
# class logits, as returned by the SSD forward pass — confirm against caller.
y_pred = torch.cat([y_pred[0], nn.Softmax(-1)(y_pred[1])], dim = -1)
# Per-anchor classification loss between the target and predicted class scores.
# NOTE(review): the slicing assumes y_true's last dim is laid out as
# [4 box values, class targets..., 1 trailing flag] — confirm against encoder.
conf_loss = self._softmax_loss(y_true[:, :, 4:-1], y_pred[:, :, 4:])
# Per-anchor regression loss on the first four values.
loc_loss = self._l1_smooth_loss(y_true[:, :, :4], y_pred[:, :, :4])
# Sum both losses over the anchors, weighted by the last channel of y_true.
# NOTE(review): that channel presumably is a 0/1 "positive anchor" flag, so
# only anchors matched to a ground-truth box contribute here — confirm.
pos_loc_loss = torch.sum(loc_loss * y_true[:, :, -1],
axis=1)
pos_conf_loss = torch.sum(conf_loss * y_true[:, :, -1],
axis=1)
# Number of positive anchors per image (sum of the flag channel).
num_pos = torch.sum(y_true[:, :, -1], axis=-1)
# Total number of anchors per image.
num_boxes = y_true.size()[1]
# Negatives per image: neg_pos_ratio times the positives, capped by the number
# of non-positive anchors.
num_neg = torch.min(self.neg_pos_ratio * num_pos, num_boxes - num_pos)
# Mask of the images that have at least one negative to pick.
pos_num_neg_mask = num_neg > 0
# If any image has negatives, use the batch-wide total; otherwise fall back to
# self.negatives_for_hard.
# NOTE(review): .cpu() is called on num_neg_batch below, so
# self.negatives_for_hard presumably has to be a tensor, not a plain number.
has_min = torch.sum(pos_num_neg_mask)
num_neg_batch = torch.sum(num_neg) if has_min > 0 else self.negatives_for_hard
# Range of y_pred's last dim holding the class scores that follow the
# background class (offset by the 4 box values).
# NOTE(review): with background_label_id == 0 this would cover every
# non-background class — confirm.
confs_start = 4 + self.background_label_id + 1
confs_end = confs_start + self.num_classes - 1
# Sum of those class probabilities per anchor; a large value on a non-positive
# anchor means the background anchor is scored like an object.
max_confs = torch.sum(y_pred[:, :, confs_start:confs_end], dim=2)
# Zero out the entries of positive anchors via (1 - flag) and flatten across
# the whole batch, so the top-k below only ranks non-positive anchors.
max_confs = (max_confs * (1 - y_true[:, :, -1])).view([-1])
# Pick the num_neg_batch highest-scoring entries over the flattened batch and
# gather their classification losses.
_, indices = torch.topk(max_confs, k = int(num_neg_batch.cpu().numpy().tolist()))
neg_conf_loss = torch.gather(conf_loss.view([-1]), 0, indices)
# Final loss: replace zero positive counts by one so the division below cannot
# divide by zero, then normalise the summed losses by the number of positives.
num_pos = torch.where(num_pos != 0, num_pos, torch.ones_like(num_pos))
total_loss = torch.sum(pos_conf_loss) + torch.sum(neg_conf_loss) + torch.sum(self.alpha * pos_loc_loss)
total_loss = total_loss / torch.sum(num_pos)
训练结果:与yolo v3和Faster RCNN对比(左边 SSD,右边 yolov3)
用的是与yolo v3一样的训练集,整体感觉比yolo v3要好,但是在小物体上,SSD无能为力。我分别使用了vgg和resnet50做SSD的backbone,整体提升比较明显,但是在小物体检测上,Resnet和vgg都表现得不好。
评论(0)
您还未登录,请登录后发表或查看评论