SSD

SSD是one-stage目标检测方法,和yolo一样。可以同时进行目标检测和分类,速度很快。

SSD主要流程:

选取合适的模型结构,挑选其中合适的特征层或者所有特征层作为backbone,再之后加上额外的卷积网络,组成SSD网络

选取其中的6层卷积层输出,对卷积层输出做2个操作。

坐标信息卷积处理:num_anchors x 4

分类信息卷积处理:num_anchors x num_classes

预测结果解码
具体代码可以参考CSDN@Bubbliiiing的代码。本次实现的是以Resnet50作为backbone的SSD检测,这是为了与之前的Faster RCNN做对比,其他代码可以参考他的代码。我将我实现的Resnet50 backbone贴出来,需要的小伙伴复制粘贴后修改参考即可。

import torch.nn.functional as F
import torch
from torch import nn
from torch.nn import init
from torchvision.models import resnet50


class L2Norm(nn.Module):
    """Channel-wise L2 normalisation followed by a learnable per-channel scale.

    Args:
        n_channels: Number of channels (dim 1) of the feature map to normalise.
        scale: Constant every entry of ``weight`` is initialised with.
    """

    def __init__(self, n_channels, scale):
        super().__init__()
        self.n_channels = n_channels
        # A falsy scale (0, None, ...) is stored as None.
        self.gamma = scale if scale else None
        # Added to the norm so that an all-zero position does not divide by 0.
        self.eps = 1e-10
        self.weight = nn.Parameter(torch.Tensor(n_channels))
        self.reset_parameters()

    def reset_parameters(self):
        """Fill ``weight`` with the constant ``gamma``."""
        init.constant_(self.weight, self.gamma)

    def forward(self, x):
        """Divide ``x`` by its L2 norm over dim 1 and rescale every channel.

        ``x`` has to be 4-dimensional because the weight is expanded from
        shape ``(1, n_channels, 1, 1)`` to the shape of ``x``.
        """
        squared_sum = torch.sum(x.pow(2), dim=1, keepdim=True)
        magnitude = torch.sqrt(squared_sum) + self.eps
        normalised = torch.div(x, magnitude)
        per_channel = self.weight.reshape(1, -1, 1, 1).expand_as(normalised)
        return per_channel * normalised


def add_extras(in_channels):
    """Build the four extra 3x3 convolutions stacked after the backbone.

    Args:
        in_channels: Channel count of the feature map fed to the first layer.

    Returns:
        An ``nn.ModuleList`` with four ``nn.Conv2d`` layers, in the order
        they are meant to be applied.
    """
    # (out_channels, stride, padding) for each layer. The spatial sizes in the
    # trailing comments are the ones quoted for a 19x19 input map.
    layer_specs = (
        (512, 2, 1),  # Block 6: 19x19 -> 10x10
        (256, 2, 1),  # Block 7: 10x10 -> 5x5
        (128, 1, 0),  # Block 8: 5x5 -> 3x3
        (256, 1, 0),  # Block 9: 3x3 -> 1x1
    )

    extras = nn.ModuleList()
    previous_channels = in_channels
    for out_channels, stride, padding in layer_specs:
        extras.append(
            nn.Conv2d(
                previous_channels,
                out_channels,
                kernel_size=3,
                stride=stride,
                padding=padding,
            )
        )
        # Each layer consumes the output of the one before it.
        previous_channels = out_channels
    return extras

def resnet_backbone(pretrained=True, progress=True, **kwargs):
    """Return a truncated torchvision ResNet-50 for use as the SSD backbone.

    Only the first seven child modules of the torchvision model are kept
    (everything up to and including ``layer3``), so indices 5 and 6 of the
    returned ``nn.Sequential`` are ``layer2`` and ``layer3``.

    Args:
        pretrained: Passed to ``torchvision.models.resnet50``.
        progress: Passed to ``torchvision.models.resnet50``. It was accepted
            but silently dropped before; it is now forwarded.
        **kwargs: Accepted for signature compatibility; not used.

    Returns:
        A tuple ``(model, layer2_out_channels, layer3_out_channels)`` where
        the channel counts are read from the last bottleneck of each stage.
    """
    resnet50_model = resnet50(pretrained=pretrained, progress=progress)
    # Slice the plain list so the pooling/fc modules are never wrapped at all.
    kept_stages = list(resnet50_model.children())[:7]
    model = nn.Sequential(*kept_stages)
    layer2_out_channels = resnet50_model.layer2[-1].conv3.out_channels
    layer3_out_channels = resnet50_model.layer3[-1].conv3.out_channels
    return model, layer2_out_channels, layer3_out_channels


class SSD_Resnet50(nn.Module):
    """SSD detector built on a truncated ResNet-50 backbone.

    Six feature maps are used as prediction sources: the outputs of
    ``self.features[5]`` and ``self.features[6]`` plus the outputs of the four
    extra convolutions. Every source gets one 3x3 localisation convolution
    (``anchors * 4`` channels) and one 3x3 classification convolution
    (``anchors * num_classes`` channels).

    Args:
        num_classes: Number of class scores predicted for every anchor.
        pretrained: Forwarded to ``resnet_backbone``.
    """

    def __init__(self, num_classes, pretrained=False):
        super(SSD_Resnet50, self).__init__()
        self.num_classes = num_classes
        # Backbone plus the channel counts of its last two stages, which are
        # the first and second prediction sources.
        self.features, first_channels, second_channels = resnet_backbone(pretrained)
        # Four extra convolutions produce prediction sources three to six.
        self.extras = add_extras(second_channels)
        # The norm is applied to the second source, so its width is taken from
        # the backbone instead of being hard-coded to 1024.
        self.L2Norm = L2Norm(second_channels, 20)

        # Number of anchors predicted per location for each of the six sources.
        mbox = [4, 6, 6, 6, 4, 4]
        source_channels = [first_channels, second_channels]
        source_channels += [conv.out_channels for conv in self.extras]

        loc_layers = []
        conf_layers = []
        for channels, num_anchors in zip(source_channels, mbox):
            # 4 box coordinates per anchor.
            loc_layers.append(
                nn.Conv2d(channels, num_anchors * 4, kernel_size=(3, 3), padding=1)
            )
            # One score per class per anchor.
            conf_layers.append(
                nn.Conv2d(channels, num_anchors * num_classes, kernel_size=(3, 3), padding=1)
            )
        self.loc = nn.ModuleList(loc_layers)
        self.conf = nn.ModuleList(conf_layers)

    def forward(self, x):
        """Run the detector.

        Args:
            x: Image batch; the size comments below refer to 300x300 inputs.

        Returns:
            A tuple ``(loc, conf)`` reshaped to ``[batch, num_anchors, 4]``
            and ``[batch, num_anchors, num_classes]``.
        """
        sources = []
        for index, stage in enumerate(self.features):
            x = stage(x)
            if index == 5:
                # First prediction source (38x38 for a 300x300 input).
                sources.append(x)
        # x is now the output of the last backbone stage (19x19 for a 300x300
        # input). It is normalised, and the normalised map is both the second
        # source and the input of the extra layers.
        x = self.L2Norm(x)
        sources.append(x)
        # Remaining sources three to six: 10x10, 5x5, 3x3 and 1x1.
        for extra in self.extras:
            x = F.relu(extra(x), inplace=True)
            sources.append(x)

        loc = []
        conf = []
        for feature, loc_head, conf_head in zip(sources, self.loc, self.conf):
            # [batch, channels, h, w] -> [batch, h, w, channels] so that the
            # values belonging to one location stay contiguous when flattened.
            loc.append(loc_head(feature).permute(0, 2, 3, 1).contiguous())
            conf.append(conf_head(feature).permute(0, 2, 3, 1).contiguous())

        # Flatten every source per sample and concatenate all six of them.
        loc = torch.cat([o.view(o.size(0), -1) for o in loc], 1)
        conf = torch.cat([o.view(o.size(0), -1) for o in conf], 1)
        return (
            loc.view(loc.size(0), -1, 4),
            conf.view(conf.size(0), -1, self.num_classes),
        )

if __name__ == '__main__':
    from torchsummary import summary

    # Fall back to the CPU so this smoke test also runs without a CUDA device.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = SSD_Resnet50(10).to(device)
    # summary() prints the table itself; wrapping it in print() only added a
    # trailing "None" line to the output.
    summary(model, (3, 300, 300), device=device)

Anchor box

Faster RCNN、yolov3、v4、v5都有各自专属的anchor box,SSD也不例外。首先,在我们的多尺度卷积层上,每一层的每个像素点会按mbox=(4, 6, 6, 6, 4, 4)生成对应数量的anchor box。比如说在resnet的第六层(38x38)的每个像素点生成4种大小比例的anchor box,比例分别为[1, 1, 2, 1/2];在resnet的第七层(19x19)的每个像素点会生成6种大小比例的anchor box,比例分别为[1, 1, 2, 1/2, 3, 1/3]。这些比例就是aspect_ratios。

那么怎么生成anchor box?

# NOTE(review): excerpt from a larger function; the enclosing definition and
# the names it reads (input_shape, anchors_size, aspect_ratios,
# feature_heights, feature_widths) are not shown here.
anchors = []
    for i in range(len(feature_heights)):   # one iteration per feature layer; per the original note there are 6 layers with [4, 6, 6, 6, 4, 4] anchors per location
        anchor_boxes = AnchorBox(input_shape, anchors_size[i], max_size = anchors_size[i+1],aspect_ratios = aspect_ratios[i]).call([feature_heights[i], feature_widths[i]])
        anchors.append(anchor_boxes)# anchors per layer: 38*38*4=5776, 19*19*6=2166, 600, 150, 36, 4 -> 8732 boxes in total
        
        
class AnchorBox():
    """Generate the SSD prior (anchor) boxes of a single feature layer.

    Args:
        input_shape: ``(height, width)`` of the network input image.
        min_size: Side length of the small square anchor, in pixels. It is
            also the base size of the rectangular anchors.
        max_size: Used together with ``min_size`` for the large square anchor
            (side ``sqrt(min_size * max_size)``). When ``None`` the large
            square is left out instead of raising ``TypeError``.
        aspect_ratios: Width/height ratios. Every ratio ``ar`` is stored
            together with its reciprocal ``1 / ar``. ``None`` is treated as
            ``[1]`` (square anchors only) instead of raising ``TypeError``.
        flip: Accepted for interface compatibility; it is not read, the
            reciprocal ratios are always added.
    """

    def __init__(self, input_shape, min_size, max_size=None, aspect_ratios=None, flip=True):
        self.input_shape = input_shape
        # Callers in this project pass consecutive entries of a preset size
        # list as (min_size, max_size).
        self.min_size = min_size
        self.max_size = max_size

        if aspect_ratios is None:
            aspect_ratios = [1]
        self.aspect_ratios = []
        for ar in aspect_ratios:
            self.aspect_ratios.append(ar)
            self.aspect_ratios.append(1.0 / ar)

    def call(self, layer_shape, mask=None):
        """Return the anchors of a feature layer as normalised corner boxes.

        Args:
            layer_shape: ``(height, width)`` of the feature layer, e.g. 38x38.
            mask: Unused.

        Returns:
            A float array of shape ``(height * width * anchors_per_cell, 4)``
            holding ``(x_min, y_min, x_max, y_max)`` divided by the image
            size and clipped to ``[0, 1]``. Rows are ordered cell by cell
            (x varies fastest), and within a cell in anchor order.
        """
        layer_height, layer_width = layer_shape[0], layer_shape[1]
        img_height, img_width = self.input_shape[0], self.input_shape[1]

        box_widths = []
        box_heights = []
        for ar in self.aspect_ratios:
            if ar == 1 and not box_widths:
                # The very first box is the small square.
                box_widths.append(self.min_size)
                box_heights.append(self.min_size)
            elif ar == 1:
                # Any later ratio of 1 is the large square; it needs max_size.
                if self.max_size is None:
                    continue
                side = np.sqrt(self.min_size * self.max_size)
                box_widths.append(side)
                box_heights.append(side)
            else:
                # Rectangle with the requested width/height ratio.
                root = np.sqrt(ar)
                box_widths.append(self.min_size * root)
                box_heights.append(self.min_size / root)

        # Half extents per anchor, shape (anchors_per_cell, 2) as (w/2, h/2).
        half_sizes = 0.5 * np.array([box_widths, box_heights], dtype=float).T

        # Stride of the feature layer expressed in input pixels.
        step_x = img_width / layer_width
        step_y = img_height / layer_height

        # Centres of all grid cells, shape (cells, 2) as (cx, cy).
        linx = np.linspace(0.5 * step_x, img_width - 0.5 * step_x, layer_width)
        liny = np.linspace(0.5 * step_y, img_height - 0.5 * step_y, layer_height)
        centers_x, centers_y = np.meshgrid(linx, liny)
        centers = np.stack([centers_x.ravel(), centers_y.ravel()], axis=1)

        # Broadcast every centre against every anchor size to get the
        # top-left and bottom-right corners: (cells, anchors_per_cell, 2).
        top_left = centers[:, None, :] - half_sizes[None, :, :]
        bottom_right = centers[:, None, :] + half_sizes[None, :, :]
        anchor_boxes = np.concatenate([top_left, bottom_right], axis=2)

        # Normalise x by the image width and y by the image height.
        anchor_boxes /= np.array(
            [img_width, img_height, img_width, img_height], dtype=float
        )
        anchor_boxes = anchor_boxes.reshape(-1, 4)

        # Boxes reaching over the image border are cut at the border.
        return np.clip(anchor_boxes, 0.0, 1.0)

第六层 第七层指的是(参考:https://blog.csdn.net/kui9702/article/details/123807917 中第6层,如下图)

image-20220418224018747

之后我们来看SSD是如何计算loss,从上面模型的forward中可以看到,每一次前向传播会返回坐标回归预测结果和每一个anchor box对应的分类预测结果。

# NOTE(review): excerpt from the body of a loss method; `self`, `y_true` and
# the incoming `y_pred` (the (loc, conf) tuple returned by the SSD forward)
# are defined outside this excerpt.
# Concatenate the regression output with the softmax-normalised class scores
# along the last dimension.
y_pred = torch.cat([y_pred[0], nn.Softmax(-1)(y_pred[1])], dim = -1)

# Classification loss between the label channels 4:-1 of y_true and the
# predicted class probabilities.
conf_loss = self._softmax_loss(y_true[:, :, 4:-1], y_pred[:, :, 4:])

# Regression loss on the first four (box) channels.
loc_loss = self._l1_smooth_loss(y_true[:, :, :4], y_pred[:, :, :4])

# Loss of the positive anchors only: the last channel of y_true is used as
# the weighting mask and the masked losses are summed over the anchor axis,
# giving one value per image.
pos_loc_loss = torch.sum(loc_loss * y_true[:, :, -1],
                                     axis=1)
pos_conf_loss = torch.sum(conf_loss * y_true[:, :, -1],
                                      axis=1)

# Number of positive anchors in every image.
num_pos = torch.sum(y_true[:, :, -1], axis=-1)

# Total number of anchors per image.
num_boxes = y_true.size()[1]
# Number of negatives per image: neg_pos_ratio times the positives, capped by
# the number of remaining anchors.
num_neg = torch.min(self.neg_pos_ratio * num_pos, num_boxes - num_pos)
# Flag the images for which at least one negative is to be selected.
pos_num_neg_mask = num_neg > 0
# Negatives to pick over the whole batch; when no image has any, fall back to
# self.negatives_for_hard (the original note gives that default as 100).
has_min = torch.sum(pos_num_neg_mask)
num_neg_batch = torch.sum(num_neg) if has_min > 0 else self.negatives_for_hard

# Column range inside y_pred that holds the class probabilities after the
# background column (assumes the background class sits at index
# background_label_id of the class scores - TODO confirm).
confs_start = 4 + self.background_label_id + 1
confs_end = confs_start + self.num_classes - 1
# Sum of those probabilities per anchor; for a negative anchor a larger sum
# means it is harder to classify.
max_confs = torch.sum(y_pred[:, :, confs_start:confs_end], dim=2)
# Hard negative selection: multiplying by (1 - positive mask) zeroes the
# positive anchors, the result is flattened over the whole batch, the
# num_neg_batch highest-scoring entries are chosen and their classification
# loss is gathered.
# NOTE(review): the .cpu() call requires num_neg_batch to be a tensor, so
# self.negatives_for_hard presumably is one - confirm where it is set.
max_confs   = (max_confs * (1 - y_true[:, :, -1])).view([-1])
_, indices  = torch.topk(max_confs, k = int(num_neg_batch.cpu().numpy().tolist()))
neg_conf_loss = torch.gather(conf_loss.view([-1]), 0, indices)

# Total loss: positive and selected-negative classification loss plus the
# alpha-weighted regression loss, divided by the number of positives. Images
# without positives count as 1 so the divisor cannot be zero.
num_pos = torch.where(num_pos != 0, num_pos, torch.ones_like(num_pos))
total_loss  = torch.sum(pos_conf_loss) + torch.sum(neg_conf_loss) + torch.sum(self.alpha * pos_loc_loss)
total_loss  = total_loss / torch.sum(num_pos)

训练结果:与yolo v3和Faster RCNN对比(左边SSD,右边yolov3)

在这里插入图片描述

在这里插入图片描述

在这里插入图片描述

用的是与yolo v3一样的训练集,感觉效果比yolo v3要好,但是在小物体上,SSD无能为力。我分别使用了vgg和resnet50做SSD的backbone,提升比较明显,但是在小物体检测上,Resnet和vgg都表现得不好。