SSD

背景介绍

SSD(Single Shot MultiBox Detector):于2016年发表在ECCV上。Single Shot MultiBox Detector的字面意思为：单次多框检测器，顾名思义，属于目标检测算法中一步法的思想，而且利用到多个先验框的一种算法，是一步法的典型代表。

SSD

SSD特点

特征提取网络为VGG，构建特征提取网络较为简单。
针对于不同尺度的特征层设计不同大小的先验框，融合不同特征层的检测信息对先验框中是否包含物体进行分类。

SSD图像分析

SSD

TensorFlow2.0实现

from functools import reduce
import numpy as np
import tensorflow as tf
import tensorflow.keras as keras


def compose(*funcs):
    """Chain callables left to right.

    ``compose(f, g, h)(*args, **kwargs)`` evaluates ``h(g(f(*args, **kwargs)))``:
    the first callable receives the original arguments and every later one
    receives the previous result as its single positional argument.

    Raises:
        ValueError: if no callables are supplied.
    """
    if not funcs:
        raise ValueError('Composition of empty sequence not supported.')

    def _pipe(first, second):
        # Build one callable that feeds the output of `first` into `second`.
        def _piped(*args, **kwargs):
            return second(first(*args, **kwargs))
        return _piped

    return reduce(_pipe, funcs)


class L2_Normalize(keras.layers.Layer):
    """Channel-wise L2 normalisation followed by a learnable per-channel scale.

    Args:
        scale: initial value of every entry of the per-channel scale vector.
        **kwargs: standard Keras layer keyword arguments (e.g. ``name``).
    """

    def __init__(self, scale, **kwargs):
        # Keyword arguments must be forwarded as keywords. Passing the dict
        # positionally binds it to the base class's first parameter, so the
        # requested ``name`` would be dropped.
        super(L2_Normalize, self).__init__(**kwargs)
        self.scale = scale

    def build(self, input_shape):
        # One scale value per channel (axis 3 of the input), initialised to `scale`.
        self.gamma = tf.Variable(self.scale * np.ones((input_shape[3],), dtype='float32'))

    def call(self, x, mask=None):
        # Normalise along axis 3, then rescale each channel by gamma.
        output = tf.nn.l2_normalize(x, axis=3)
        output *= self.gamma

        return output

    def get_config(self):
        # Expose `scale` so the layer can be re-created from its config.
        config = super(L2_Normalize, self).get_config()
        config['scale'] = self.scale
        return config


def ssd(input_shape):
    """Build the SSD detection network as a Keras functional model.

    Six feature maps are tapped. Each gets a regression head (4 values per
    prior box) and a classification head (21 scores per prior box); the
    number of prior boxes per cell is 4, 6, 6, 6, 4, 4 in tap order.

    Args:
        input_shape: shape of one input image, passed to ``keras.layers.Input``.
            NOTE(review): the final reshapes hard-code 8732 prior boxes, which
            is the count the accompanying article derives for a 300x300 input;
            other sizes presumably will not fit the Reshape layers -- confirm.

    Returns:
        A ``keras.Model`` named 'SSD' whose output has shape
        (batch, 8732, 4 + 21): 4 regression values followed by 21
        softmax class probabilities for every prior box.
    """
    input_tensor = keras.layers.Input(input_shape, name='input')
    x = input_tensor

    # Block 1: two 3x3 convolutions (64 filters), then 2x2 max-pooling.
    x = compose(keras.layers.Conv2D(64, (3, 3), (1, 1), 'same', activation='relu', name='conv1_1'),
                keras.layers.Conv2D(64, (3, 3), (1, 1), 'same', activation='relu', name='conv1_2'),
                keras.layers.MaxPool2D((2, 2), (2, 2), 'same', name='maxpool1'))(x)

    # Block 2: two 3x3 convolutions (128 filters), then 2x2 max-pooling.
    x = compose(keras.layers.Conv2D(128, (3, 3), (1, 1), 'same', activation='relu', name='conv2_1'),
                keras.layers.Conv2D(128, (3, 3), (1, 1), 'same', activation='relu', name='conv2_2'),
                keras.layers.MaxPool2D((2, 2), (2, 2), 'same', name='maxpool2'))(x)

    # Block 3: three 3x3 convolutions (256 filters), then 2x2 max-pooling.
    x = compose(keras.layers.Conv2D(256, (3, 3), (1, 1), 'same', activation='relu', name='conv3_1'),
                keras.layers.Conv2D(256, (3, 3), (1, 1), 'same', activation='relu', name='conv3_2'),
                keras.layers.Conv2D(256, (3, 3), (1, 1), 'same', activation='relu', name='conv3_3'),
                keras.layers.MaxPool2D((2, 2), (2, 2), 'same', name='maxpool3'))(x)

    # Block 4: three 3x3 convolutions (512 filters); no pooling before the first tap.
    x = compose(keras.layers.Conv2D(512, (3, 3), (1, 1), 'same', activation='relu', name='conv4_1'),
                keras.layers.Conv2D(512, (3, 3), (1, 1), 'same', activation='relu', name='conv4_2'),
                keras.layers.Conv2D(512, (3, 3), (1, 1), 'same', activation='relu', name='conv4_3'))(x)

    # Tap 1 uses an L2-normalised copy of conv4_3 (initial scale 20); the
    # backbone itself continues from the un-normalised `x`.
    l2_norm = L2_Normalize(20, name='l2_norm')(x)

    # Heads for tap 1: 4 priors per cell -> 4*4 regression and 4*21 class channels.
    feature1_reg = compose(keras.layers.Conv2D(4 * 4, (3, 3), (1, 1), 'same', name='feature1_reg_conv'),
                           keras.layers.Flatten(name='feature1_reg_flatten'))(l2_norm)
    feature1_cls = compose(keras.layers.Conv2D(4 * 21, (3, 3), (1, 1), 'same', name='feature1_cls_conv'),
                           keras.layers.Flatten(name='feature1_cls_flatten'))(l2_norm)

    # Block 5: pooling, three 3x3 convolutions (512 filters), a stride-1 3x3
    # pooling, a dilated 3x3 convolution (rate 6) and a 1x1 convolution (1024 filters).
    x = compose(keras.layers.MaxPool2D((2, 2), (2, 2), 'same', name='maxpool4'),
                keras.layers.Conv2D(512, (3, 3), (1, 1), 'same', activation='relu', name='conv5_1'),
                keras.layers.Conv2D(512, (3, 3), (1, 1), 'same', activation='relu', name='conv5_2'),
                keras.layers.Conv2D(512, (3, 3), (1, 1), 'same', activation='relu', name='conv5_3'),
                keras.layers.MaxPool2D((3, 3), (1, 1), 'same', name='maxpool5'),
                keras.layers.Conv2D(1024, (3, 3), (1, 1), 'same', activation='relu', dilation_rate=(6, 6), name='conv5_4'),
                keras.layers.Conv2D(1024, (1, 1), (1, 1), 'same', activation='relu', name='conv5_5'))(x)

    # Heads for tap 2: 6 priors per cell.
    feature2_reg = compose(keras.layers.Conv2D(6 * 4, (3, 3), (1, 1), 'same', name='feature2_reg_conv'),
                           keras.layers.Flatten(name='feature2_reg_flatten'))(x)
    feature2_cls = compose(keras.layers.Conv2D(6 * 21, (3, 3), (1, 1), 'same', name='feature2_cls_conv'),
                           keras.layers.Flatten(name='feature2_cls_flatten'))(x)

    # Block 6: 1x1 bottleneck, then a stride-2 3x3 convolution that halves the map.
    x = compose(keras.layers.Conv2D(256, (1, 1), (1, 1), 'same', activation='relu', name='conv6_1'),
                keras.layers.Conv2D(512, (3, 3), (2, 2), 'same', activation='relu', name='conv6_2'))(x)

    # Heads for tap 3: 6 priors per cell.
    feature3_reg = compose(keras.layers.Conv2D(6 * 4, (3, 3), (1, 1), 'same', name='feature3_reg_conv'),
                           keras.layers.Flatten(name='feature3_reg_flatten'))(x)
    feature3_cls = compose(keras.layers.Conv2D(6 * 21, (3, 3), (1, 1), 'same', name='feature3_cls_conv'),
                           keras.layers.Flatten(name='feature3_cls_flatten'))(x)

    # Block 7: 1x1 bottleneck, then a stride-2 3x3 convolution.
    x = compose(keras.layers.Conv2D(128, (1, 1), (1, 1), 'same', activation='relu', name='conv7_1'),
                keras.layers.Conv2D(256, (3, 3), (2, 2), 'same', activation='relu', name='conv7_2'))(x)

    # Heads for tap 4: 6 priors per cell.
    feature4_reg = compose(keras.layers.Conv2D(6 * 4, (3, 3), (1, 1), 'same', name='feature4_reg_conv'),
                           keras.layers.Flatten(name='feature4_reg_flatten'))(x)
    feature4_cls = compose(keras.layers.Conv2D(6 * 21, (3, 3), (1, 1), 'same', name='feature4_cls_conv'),
                           keras.layers.Flatten(name='feature4_cls_flatten'))(x)

    # Block 8: 1x1 bottleneck, then an unpadded ('valid') 3x3 convolution that
    # shrinks each spatial dimension by 2.
    x = compose(keras.layers.Conv2D(128, (1, 1), (1, 1), 'same', activation='relu', name='conv8_1'),
                keras.layers.Conv2D(256, (3, 3), (1, 1), 'valid', activation='relu', name='conv8_2'))(x)

    # Heads for tap 5: 4 priors per cell.
    feature5_reg = compose(keras.layers.Conv2D(4 * 4, (3, 3), (1, 1), 'same', name='feature5_reg_conv'),
                           keras.layers.Flatten(name='feature5_reg_flatten'))(x)
    feature5_cls = compose(keras.layers.Conv2D(4 * 21, (3, 3), (1, 1), 'same', name='feature5_cls_conv'),
                           keras.layers.Flatten(name='feature5_cls_flatten'))(x)

    # Block 9: same pattern as block 8.
    x = compose(keras.layers.Conv2D(128, (1, 1), (1, 1), 'same', activation='relu', name='conv9_1'),
                keras.layers.Conv2D(256, (3, 3), (1, 1), 'valid', activation='relu', name='conv9_2'))(x)

    # Heads for tap 6: 4 priors per cell.
    feature6_reg = compose(keras.layers.Conv2D(4 * 4, (3, 3), (1, 1), 'same', name='feature6_reg_conv'),
                           keras.layers.Flatten(name='feature6_reg_flatten'))(x)
    feature6_cls = compose(keras.layers.Conv2D(4 * 21, (3, 3), (1, 1), 'same', name='feature6_cls_conv'),
                           keras.layers.Flatten(name='feature6_cls_flatten'))(x)

    # Join the flattened heads of all six taps, in tap order.
    concatenate_reg = keras.layers.Concatenate(name='concatenate_reg')([feature1_reg, feature2_reg, feature3_reg, feature4_reg, feature5_reg, feature6_reg])
    concatenate_cls = keras.layers.Concatenate(name='concatenate_cls')([feature1_cls, feature2_cls, feature3_cls, feature4_cls, feature5_cls, feature6_cls])

    # One row per prior box: 4 regression values / 21 class scores.
    reshape_reg = keras.layers.Reshape((8732, 4), name='reshape_reg')(concatenate_reg)
    reshape_cls = keras.layers.Reshape((8732, 21), name='reshape_cls')(concatenate_cls)

    # Turn the class scores of each prior box into probabilities.
    softmax_cls = keras.layers.Softmax(name='softmax_cls')(reshape_cls)

    # Final layout per prior box: [4 regression values | 21 class probabilities].
    output = keras.layers.Concatenate(name='concatenate')([reshape_reg, softmax_cls])

    model = keras.Model(input_tensor, output, name='SSD')

    return model


if __name__ == '__main__':

    # Build the network for 300x300 three-channel images and print its layer summary.
    model = ssd(input_shape=(300, 300, 3))
    model.build(input_shape=(None, 300, 300, 3))
    model.summary()

SSD

Shape数据集完整实战

文件路径关系说明

project
- shape
  - train_imgs(训练集图像文件夹)
  - annotations(训练集标签文件夹)
  - test_imgs(测试集图像文件夹)
- SSD_weight(模型权重文件夹)
- SSD_test_result(测试集结果文件夹)
- SSD.py

实战步骤说明

目标检测和语义分割是两种不同类型的工程项目，目标检测实战处理比语义分割困难的多，首先要读取真实框信息，将其保存下来，为了后面编码使用。
建立先验框，根据网络结构，在不同特征层上建立不同的先验框，先验框的总个数为每个回归分类特征层的像素点个数x每个像素点上的先验框个数。以论文中的先验框为例，特征层有6个，大小分别为38x38，19x19，10x10，5x5，3x3，1x1，特征层上每个像素点的先验框个数分别为4，6，6，6，4，4。

$$ 38^2 \times 4+19^2 \times 6+10^2 \times 6+5^2 \times 6+3^2 \times 4+1^2 \times 4=8732 $$
故先验框总数为8732个。
根据真实框的信息，和所有先验框计算IOU，将IOU大于设定值的记录下来，作为正样本。然后进行编码，在所属类别的置信度上面置1，其他类别置信度置0，并计算正样本先验框的中心坐标与宽高和真实框的中心坐标与宽高之间的差异。输出(batch_size, num_prior, 4 + 1 + num_class)，num_prior为先验框的个数，每个先验框有4 + 1 + num_class个值，4代表中心坐标和宽高相对真实框的差异，1代表属于背景的置信度，num_class代表属于某一个类别的置信度。编码的目的是得到真实框对应的神经网络的输出应该是什么样子，然后让两者尽可能的接近。

IOU(Intersection Over Union，交并比)：用于衡量两个区域重合程度的指标(语义分割中也常用平均IOU来评估算法性能)。交并比非常好理解：算法结果与真实物体的交集面积除以两者的并集面积。通过下图可以直观地看出IOU的计算方法。
IOU
4. 设计损失函数，因为先验框中大部分都是负样本，因此不能直接计算损失函数，首先要对正负样本进行比例调整。一般选择正负样本比例为1：3，然后使用交叉熵损失函数计算正负样本的分类损失，使用smooth L1 loss计算正样本的定位损失。
5. 搭建神经网络，设置合适参数，进行训练。
6. 预测时，需要根据神经网络的输出进行逆向解码(编码的反过程)，根据置信度，选择非背景置信度大于设定值的先验框作为候选框，并且该框的类别设为置信度最大索引对应的类别，如最大值的索引为2，则该预测框预测的物体类别是第二类。然后根据先验框的坐标和4个回归参数确定候选框的左上角和右下角坐标。对每一类候选框进行NMS得到预测框，并且在图像上画出预测框，并且标出置信度即可完成目标检测任务。
NMS(Non-Maximum Suppression，非极大值抑制)：简单地说，不是最大的我不要，在目标检测中，往往图像上存在大量先验框，会导致很多附近的框都会预测出同一个物体，但是我们只保留最大的一个预测结果，这就是非极大值抑制。
步骤：
(1)从最大概率矩形框F开始，分别判断A~E与F的IOU是否大于某个设定的阈值，假设B、D与F的重叠度超过阈值，那么就扔掉B、D；并标记第一个矩形框F，是我们保留下来的。
(2)从剩下的矩形框A、C、E中，选择概率最大的E，然后判断E与A、C的重叠度，重叠度大于一定的阈值，那么就扔掉；并标记E是我们保留下来的第二个矩形框。
(3)重复步骤(2)，直到所有的框都被抛弃或者保留。
NMS

小技巧

神经网络的输出维度为(batch_size, num_prior, 4 + 1 + num_class)，此数据集为3类，因此最后一个维度是8。每个先验框有8个索引，前面4个索引代表先验框的回归参数，用来对先验框进行调整得到预测框，索引为4代表背景，索引为5代表圆形，索引为6代表三角形，索引为7代表正方形。
实际的工程应用中，常常还需要对数据集进行大小调整和增强，在这里为了简单起见，没有进行复杂的操作，小伙伴们应用中要记得根据自己的需要，对图像进行resize或者padding，然后旋转，对比度增强，仿射运算等等操作，增加模型的鲁棒性，并且实际中的图像不一定按照顺序命名的，因此应用中也要注意图像读取的文件名。
设置了权重的保存方式，学习率的下降方式和早停方式。
使用yield关键字，产生可迭代对象，不用将所有的数据都保存下来，大大节约内存。
其中将1000个数据，分成800个训练集，100个验证集和100个测试集，小伙伴们可以自行修改。
注意其中的一些维度变换和numpy，tensorflow常用操作，否则在阅读代码时可能会产生一些困难。
SSD的特征提取网络为VGG，小伙伴们可以参考特征提取网络部分内容，选择其他的网络进行特征提取，比较不同网络参数量，运行速度，最终结果之间的差异。
图像输入可以先将其归一化到0-1之间或者-1-1之间，因为网络的参数一般都比较小，所以归一化后计算方便，收敛较快。
根据实际的图像大小，选择合适的特征层数，先验框的形状，先验框数量，以及各种阈值
anchor尺寸的确定：anchor一般是正方形或者长方形，每个特征层上设置最大尺寸max_size和最小尺寸min_size。如果先验框为4个，则代表两个正方形和两个长方形：一个正方形的边长为$min\_size$，另一个为$\sqrt{max\_size \times min\_size}$；一个长方形的宽高为$(min\_size \times \sqrt{2},\ min\_size \div \sqrt{2})$，另一个长方形的宽高为$(min\_size \div \sqrt{2},\ min\_size \times \sqrt{2})$。如果先验框为6个，则再添加两个长方形，将上面的$\sqrt{2}$改成$\sqrt{3}$即可。
因为这个博客是对学习的一些总结和记录，意在和学习者探讨和交流，并且给准备入门的同学一些手把手的教学，因此关于目标检测的算法参数设计，我都是自己尝试的，不是针对于这个数据集最优的参数，大家可以根据自己的实际需要修改网络结构。

完整实战代码

import colorsys
import os
import xml.etree.ElementTree as ET
from functools import reduce
import numpy as np
import cv2 as cv
import tensorflow as tf
import tensorflow.keras as keras


def compose(*funcs):
    """Chain callables left to right.

    ``compose(f, g, h)(*args, **kwargs)`` evaluates ``h(g(f(*args, **kwargs)))``:
    the first callable receives the original arguments and every later one
    receives the previous result as its single positional argument.

    Raises:
        ValueError: if no callables are supplied.
    """
    if not funcs:
        raise ValueError('Composition of empty sequence not supported.')

    def _pipe(first, second):
        # Build one callable that feeds the output of `first` into `second`.
        def _piped(*args, **kwargs):
            return second(first(*args, **kwargs))
        return _piped

    return reduce(_pipe, funcs)


# Build the prior (anchor) boxes of one feature layer
def get_prior(layer_id):
    """Generate all prior boxes for the 1-based feature layer ``layer_id``.

    Reads the module-level settings ``ratios``, ``min_size``, ``max_size``,
    ``img_size``, ``feature_map`` and ``prior``.

    Returns:
        A pair of float32 arrays, each with one row per prior box
        (cells * boxes-per-cell rows, 4 columns), in image-normalised units:
        ``(center_x, center_y, width, height)`` and
        ``(xmin, ymin, xmax, ymax)`` clipped to [0, 1].
    """
    idx = layer_id - 1
    small = min_size[idx]
    large = max_size[idx]

    # Two squares first, then every configured ratio followed by its reciprocal.
    aspects = [1, 1]
    for r in ratios[idx]:
        aspects += [r, 1 / r]

    box_widths = []
    box_heights = []
    for aspect in aspects:
        if aspect != 1:
            # Rectangle derived from the small size.
            box_widths.append(small * aspect ** 0.5)
            box_heights.append(small / aspect ** 0.5)
        elif not box_widths:
            # First square: side equals the small size.
            box_widths.append(small)
            box_heights.append(small)
        else:
            # Second square: geometric mean of the small and large sizes.
            side = (small * large) ** 0.5
            box_widths.append(side)
            box_heights.append(side)

    # Cell centres in pixels, one per feature-map cell.
    cells = feature_map[idx]
    img_h = img_size[0]
    img_w = img_size[1]
    stride_x = img_w / cells
    stride_y = img_h / cells
    xs = np.linspace(0.5 * stride_x, img_w - 0.5 * stride_x, cells)
    ys = np.linspace(0.5 * stride_y, img_h - 0.5 * stride_y, cells)
    grid_x, grid_y = np.meshgrid(xs, ys)
    centres = np.stack((grid_x.ravel(), grid_y.ravel()), axis=1)

    # Repeat (cx, cy) twice per box so each group of four columns can be
    # turned into (xmin, ymin, xmax, ymax).
    corners = np.tile(centres, (1, prior[idx] * 2))

    # NOTE(review): the full width/height (not half of it) is subtracted from
    # and added to the centre, so each box spans twice the listed size --
    # confirm this is intended.
    corners[:, 0::4] -= box_widths
    corners[:, 1::4] -= box_heights
    corners[:, 2::4] += box_widths
    corners[:, 3::4] += box_heights

    # Normalise by the image size and clamp to the image.
    corners[:, 0::2] /= img_w
    corners[:, 1::2] /= img_h
    corners = np.clip(corners.reshape(-1, 4), 0.0, 1.0)

    # Centre / size representation of the clamped boxes.
    center_wh = np.column_stack((
        0.5 * (corners[:, 2] + corners[:, 0]),
        0.5 * (corners[:, 3] + corners[:, 1]),
        corners[:, 2] - corners[:, 0],
        corners[:, 3] - corners[:, 1],
    ))

    return center_wh.astype(np.float32), corners.astype(np.float32)


# Extract bounding-box annotations from the XML files
def get_bbox(image_id, bbox_path, annotations_path):
    """Write one text line per image: ``<image path> xmin,ymin,xmax,ymax,class_id ...``.

    Reads the module-level ``imgs_path`` (to build the image path) and
    ``classes`` (to map a class name to its index).

    Args:
        image_id: iterable of image identifiers; ``<id>.xml`` is read from
            ``annotations_path`` for each of them.
        bbox_path: path of the text file to (over)write.
        annotations_path: directory containing the XML annotation files.
    """
    with open(bbox_path, 'w') as f:
        for img_id in image_id:
            # Image path: current working directory + imgs_path without its leading '.'.
            info = os.getcwd() + imgs_path[1:] + '\\' + str(img_id) + '.jpg'

            # Parse inside a `with` block so the annotation file is always closed.
            with open(annotations_path + '\\' + str(img_id) + '.xml', encoding='utf-8') as in_file:
                root = ET.parse(in_file).getroot()

            for obj in root.iter('object'):
                # A missing <difficult> tag is treated as "not difficult".
                difficult_node = obj.find('difficult')
                difficult = difficult_node.text if difficult_node is not None else '0'
                cls = obj.find('name').text
                # Skip unknown classes and objects flagged as difficult.
                if cls not in classes or int(difficult) == 1:
                    continue
                cls_id = classes.index(cls)
                xmlbox = obj.find('bndbox')
                b = tuple(int(xmlbox.find(tag).text) for tag in ('xmin', 'ymin', 'xmax', 'ymax'))
                info += " " + ",".join(str(x) for x in b) + ',' + str(cls_id)
            f.write(info + '\n')


class L2_Normalize(keras.layers.Layer):
    """Channel-wise L2 normalisation followed by a learnable per-channel scale.

    Args:
        scale: initial value of every entry of the per-channel scale vector.
        **kwargs: standard Keras layer keyword arguments (e.g. ``name``).
    """

    def __init__(self, scale, **kwargs):
        # Keyword arguments must be forwarded as keywords. Passing the dict
        # positionally binds it to the base class's first parameter, so the
        # requested ``name`` would be dropped.
        super(L2_Normalize, self).__init__(**kwargs)
        self.scale = scale

    def build(self, input_shape):
        # One scale value per channel (axis 3 of the input), initialised to `scale`.
        self.gamma = tf.Variable(self.scale * np.ones((input_shape[3],), dtype='float32'))

    def call(self, x, mask=None):
        # Normalise along axis 3, then rescale each channel by gamma.
        output = tf.nn.l2_normalize(x, axis=3)
        output *= self.gamma

        return output

    def get_config(self):
        # Expose `scale` so the layer can be re-created from its config.
        config = super(L2_Normalize, self).get_config()
        config['scale'] = self.scale
        return config


def small_ssd(input_shape):
    """Build a reduced SSD-style detector with five prediction taps.

    Reads the module-level ``prior`` (prior boxes per cell for each tap),
    ``num_class`` (class scores per prior box) and ``num_prior`` (total
    number of prior boxes, used by the final reshapes).

    Args:
        input_shape: shape of one input image, passed to ``keras.layers.Input``.

    Returns:
        A ``keras.Model`` named 'Small_SSD' whose output has shape
        (batch, num_prior, 4 + num_class): 4 regression values followed by
        the softmax class probabilities of every prior box.
    """
    input_tensor = keras.layers.Input(input_shape, name='input')
    x = input_tensor

    # Stage 1: 3x3 convolution (32 filters) + 2x2 max-pooling.
    x = compose(keras.layers.Conv2D(32, (3, 3), (1, 1), 'same', activation='relu', name='conv1_1'),
                keras.layers.MaxPool2D((2, 2), (2, 2), 'same', name='maxpool1'))(x)

    # Stage 2: 3x3 convolution (64 filters) + 2x2 max-pooling.
    x = compose(keras.layers.Conv2D(64, (3, 3), (1, 1), 'same', activation='relu', name='conv2_1'),
                keras.layers.MaxPool2D((2, 2), (2, 2), 'same', name='maxpool2'))(x)

    # Stage 3: 3x3 convolution (128 filters); its output feeds the first tap.
    x = keras.layers.Conv2D(128, (3, 3), (1, 1), 'same', activation='relu', name='conv3_1')(x)

    # Tap 1 works on an L2-normalised copy (initial scale 20); the backbone
    # continues from the un-normalised `x`.
    l2_norm = L2_Normalize(20, name='l2_norm')(x)

    # Heads for tap 1: regression (4 values per prior) and classification.
    feature1_reg = compose(keras.layers.Conv2D(prior[0] * 4, (3, 3), (1, 1), 'same', name='feature1_reg_conv'),
                           keras.layers.Flatten(name='feature1_reg_flatten'))(l2_norm)
    feature1_cls = compose(keras.layers.Conv2D(prior[0] * num_class, (3, 3), (1, 1), 'same', name='feature1_cls_conv'),
                           keras.layers.Flatten(name='feature1_cls_flatten'))(l2_norm)

    # Stage 4: 2x2 max-pooling + 3x3 convolution (256 filters).
    x = compose(keras.layers.MaxPool2D((2, 2), (2, 2), 'same', name='maxpool3'),
                keras.layers.Conv2D(256, (3, 3), (1, 1), 'same', activation='relu', name='conv4_1'),)(x)

    # Heads for tap 2.
    feature2_reg = compose(keras.layers.Conv2D(prior[1] * 4, (3, 3), (1, 1), 'same', name='feature2_reg_conv'),
                           keras.layers.Flatten(name='feature2_reg_flatten'))(x)
    feature2_cls = compose(keras.layers.Conv2D(prior[1] * num_class, (3, 3), (1, 1), 'same', name='feature2_cls_conv'),
                           keras.layers.Flatten(name='feature2_cls_flatten'))(x)

    # Stage 5: unpadded stride-2 3x3 convolution (512 filters).
    x = compose(keras.layers.Conv2D(512, (3, 3), (2, 2), 'valid', activation='relu', name='conv5_1'))(x)

    # Heads for tap 3.
    feature3_reg = compose(keras.layers.Conv2D(prior[2] * 4, (3, 3), (1, 1), 'same', name='feature3_reg_conv'),
                           keras.layers.Flatten(name='feature3_reg_flatten'))(x)
    feature3_cls = compose(keras.layers.Conv2D(prior[2] * num_class, (3, 3), (1, 1), 'same', name='feature3_cls_conv'),
                           keras.layers.Flatten(name='feature3_cls_flatten'))(x)

    # Stage 6: 1x1 bottleneck, then an unpadded stride-2 3x3 convolution.
    x = compose(keras.layers.Conv2D(128, (1, 1), (1, 1), 'same', activation='relu', name='conv6_1'),
                keras.layers.Conv2D(256, (3, 3), (2, 2), 'valid', activation='relu', name='conv6_2'))(x)

    # Heads for tap 4.
    feature4_reg = compose(keras.layers.Conv2D(prior[3] * 4, (3, 3), (1, 1), 'same', name='feature4_reg_conv'),
                           keras.layers.Flatten(name='feature4_reg_flatten'))(x)
    feature4_cls = compose(keras.layers.Conv2D(prior[3] * num_class, (3, 3), (1, 1), 'same', name='feature4_cls_conv'),
                           keras.layers.Flatten(name='feature4_cls_flatten'))(x)

    # Stage 7: 1x1 bottleneck, then an unpadded stride-1 3x3 convolution.
    x = compose(keras.layers.Conv2D(128, (1, 1), (1, 1), 'same', activation='relu', name='conv7_1'),
                keras.layers.Conv2D(256, (3, 3), (1, 1), 'valid', activation='relu', name='conv7_2'))(x)

    # Heads for tap 5.
    feature5_reg = compose(keras.layers.Conv2D(prior[4] * 4, (3, 3), (1, 1), 'same', name='feature5_reg_conv'),
                           keras.layers.Flatten(name='feature5_reg_flatten'))(x)
    feature5_cls = compose(keras.layers.Conv2D(prior[4] * num_class, (3, 3), (1, 1), 'same', name='feature5_cls_conv'),
                           keras.layers.Flatten(name='feature5_cls_flatten'))(x)

    # Join the flattened heads of all five taps, in tap order.
    concatenate_reg = keras.layers.Concatenate(name='concatenate_reg')([feature1_reg, feature2_reg, feature3_reg, feature4_reg, feature5_reg])
    concatenate_cls = keras.layers.Concatenate(name='concatenate_cls')([feature1_cls, feature2_cls, feature3_cls, feature4_cls, feature5_cls])

    # One row per prior box.
    reshape_reg = keras.layers.Reshape((num_prior, 4), name='reshape_reg')(concatenate_reg)
    reshape_cls = keras.layers.Reshape((num_prior, num_class), name='reshape_cls')(concatenate_cls)

    # Turn the class scores of each prior box into probabilities.
    softmax_cls = keras.layers.Softmax(name='softmax_cls')(reshape_cls)

    # Final layout per prior box: [4 regression values | num_class probabilities].
    output = keras.layers.Concatenate(name='concatenate')([reshape_reg, softmax_cls])

    model = keras.Model(input_tensor, output, name='Small_SSD')

    return model


# Intersection-over-union against every prior box
def iou(box):
    """Return the IoU between ``box`` and each row of the module-level ``prior_lt_rb``.

    Args:
        box: 1-D array ``[xmin, ymin, xmax, ymax]``.

    Returns:
        1-D array with one IoU value per prior box.
    """
    priors = prior_lt_rb

    # Overlap rectangle: max of the top-left corners, min of the bottom-right corners.
    overlap_lt = np.maximum(priors[:, :2], box[:2])
    overlap_rb = np.minimum(priors[:, 2:4], box[2:])
    # A negative extent means no overlap, so clamp it to zero.
    overlap_wh = np.maximum(overlap_rb - overlap_lt, 0)
    overlap_area = overlap_wh[:, 0] * overlap_wh[:, 1]

    # Area of the given box and of every prior box.
    box_area = (box[2] - box[0]) * (box[3] - box[1])
    prior_area = (priors[:, 2] - priors[:, 0]) * (priors[:, 3] - priors[:, 1])

    return overlap_area / (box_area + prior_area - overlap_area)


# Encode one ground-truth bounding box against all prior boxes
def encoder(box):
    """Encode a ground-truth box as regression targets for the matching priors.

    Reads the module-level ``num_prior``, ``overlap_threshold``,
    ``prior_center_wh`` and ``variances``.

    Args:
        box: 1-D array ``[xmin, ymin, xmax, ymax]``.

    Returns:
        Array of shape (num_prior, 5). Rows of priors whose IoU with ``box``
        exceeds ``overlap_threshold`` hold the four encoded offsets followed
        by that IoU; every other row is all zeros.
    """
    iou_val = iou(box)
    encoded_box = np.zeros((num_prior, 5))

    # Select the prior boxes that overlap this ground-truth box strongly enough
    # and record their IoU in the last column.
    assign_mask = iou_val > overlap_threshold
    encoded_box[:, -1][assign_mask] = iou_val[assign_mask]

    # Centre/size description of the selected prior boxes.
    assigned_priors = prior_center_wh[assign_mask]

    # Centre and width/height of the ground-truth box.
    box_center = 0.5 * (box[:2] + box[2:])
    box_wh = box[2:] - box[:2]

    # Centre and width/height of the selected prior boxes.
    assigned_priors_center = assigned_priors[:, :2]
    assigned_priors_wh = assigned_priors[:, 2:4]

    # Centre offset, expressed relative to the prior's width/height.
    encoded_box[:, :2][assign_mask] = box_center - assigned_priors_center
    encoded_box[:, :2][assign_mask] /= assigned_priors_wh

    # Scale by the centre variances (first two entries of `variances`).
    encoded_box[:, :2][assign_mask] /= variances[:2]

    # Log-ratio of the ground-truth size to the prior size.
    encoded_box[:, 2:4][assign_mask] = np.log(box_wh / assigned_priors_wh)
    # Scale by the size variances (last two entries of `variances`).
    encoded_box[:, 2:4][assign_mask] /= variances[2:]

    return encoded_box


# Build the network target (the y_true fed to the loss) for one image
def assign_boxes(boxes):
    """Assign ground-truth boxes to prior boxes and return the training target.

    Reads the module-level ``num_prior``, ``num_class`` and ``f_encode``.

    Args:
        boxes: 2-D array with one row per ground-truth box; the first four
            columns are the box corners and the remaining columns are copied
            verbatim into columns 4+ of the target (the caller in this file
            passes a one-hot class vector there).

    Returns:
        Array of shape (num_prior, 4 + num_class). Unmatched priors keep zero
        offsets and a 1 in column 4.
    """
    # Shape (num_prior, 4 + num_class); the first 4 columns are the regression targets.
    assignment = np.zeros((num_prior, 4 + num_class))
    # Column 4 starts at 1 for every prior.
    assignment[:, 4] = 1.0
    if len(boxes) == 0:
        return assignment
    # Encode every ground-truth box against all priors (also computes the IoU).
    encoded_boxes = np.apply_along_axis(f_encode, 1, boxes[:, :4])
    # Shape (num_boxes, num_prior, 5): four encoded offsets plus the IoU.
    encoded_boxes = encoded_boxes.reshape(-1, num_prior, 5)
    # For every prior, find the ground-truth box with the largest recorded IoU.
    best_iou = encoded_boxes[:, :, -1].max(axis=0)
    best_iou_idx = encoded_boxes[:, :, -1].argmax(axis=0)
    # A recorded IoU of 0 means no ground-truth box passed the encoder's threshold.
    best_iou_mask = best_iou > 0
    best_iou_idx = best_iou_idx[best_iou_mask]

    # Keep only the matched priors and copy the offsets of their best box.
    encoded_boxes = encoded_boxes[:, best_iou_mask, :]
    assignment[:, :4][best_iou_mask] = encoded_boxes[best_iou_idx, np.arange(len(best_iou_idx)), :4]
    # Overwrite columns 4+ of the matched priors with the label columns of their best box.
    assignment[:, 4:][best_iou_mask] = boxes[best_iou_idx, 4:]
    # `assignment` now describes what the network should predict for this image.
    return assignment


# Yield training batches lazily
def _parse_box_tokens(line):
    """Return the trailing ``xmin,ymin,xmax,ymax,class_id`` tokens of a bbox line as int lists."""
    parsed = []
    # Walk backwards from the end of the line and stop at the first token that
    # is not a 5-integer box entry. This works whether or not the leading
    # image path contains spaces.
    for token in reversed(line.split()):
        fields = token.split(',')
        if len(fields) != 5:
            break
        try:
            parsed.append([int(value) for value in fields])
        except ValueError:
            break
    parsed.reverse()
    return parsed


def generate_arrays_from_file(train_data, batch_size):
    """Endlessly yield ``(images, targets)`` batches for training or validation.

    Reads the module-level ``imgs_path``, ``bounding_info``, ``img_size`` and
    ``num_class``, and encodes targets with ``assign_boxes``.

    Args:
        train_data: list of image ids; shuffled in place at the start of
            every pass. Each id is also the line index into ``bounding_info``.
        batch_size: number of samples per yielded batch.

    Yields:
        A pair of tensors: the images scaled to [-1, 1] and the matching
        targets produced by ``assign_boxes``.

    Raises:
        FileNotFoundError: if an image file cannot be read.
    """
    n = len(train_data)
    i = 0
    while True:
        X_train = []
        Y_train = []
        # Collect one batch; samples without usable boxes are skipped.
        while len(X_train) < batch_size:
            if i == 0:
                np.random.shuffle(train_data)
            sample_id = train_data[i]
            i = (i + 1) % n

            # Parse only the box tokens; a fixed token offset would depend on
            # how many spaces the absolute image path happens to contain.
            info = np.array(_parse_box_tokens(bounding_info[sample_id]))
            if not len(info):
                continue

            box = (info[:, :4] + 1).astype(np.float32)
            box[:, [0, 2]] = box[:, [0, 2]] / img_size[1]
            box[:, [1, 3]] = box[:, [1, 3]] / img_size[0]
            # Skip samples containing a degenerate (zero or negative extent) box.
            if ((box[:, 0] - box[:, 2]) >= 0).any() or ((box[:, 1] - box[:, 3]) >= 0).any():
                continue

            img_file = imgs_path + '\\' + str(sample_id) + '.jpg'
            img = cv.imread(img_file)
            if img is None:
                # cv.imread signals failure by returning None.
                raise FileNotFoundError('cannot read image: ' + img_file)
            # Scale pixel values to [-1, 1].
            img = img / 127.5 - 1

            # One-hot class vector; class ids are shifted by one so index 0 is background.
            label = np.eye(num_class)[np.array(info[:, 4] + 1, np.int32)]
            box = np.concatenate([box, label], axis=-1)

            X_train.append(img)
            Y_train.append(assign_boxes(box))
        yield tf.constant(X_train), tf.constant(Y_train)


# Loss functions
class Loss:
    """SSD training loss: smooth-L1 localisation loss plus cross-entropy with hard-negative mining.

    ``compute_loss`` reads the module-level ``neg_pos_ratio``, ``num_prior``,
    ``num_class`` and ``negatives_for_hard``.
    """

    def l1_smooth_loss(self, y_true, y_pred):
        """Smooth-L1 loss summed over the last axis."""
        abs_loss = tf.abs(y_true - y_pred)
        sq_loss = 0.5 * (y_true - y_pred) ** 2
        # Quadratic below 1, linear above.
        l1_loss = tf.where(tf.less(abs_loss, 1.0), sq_loss, abs_loss - 0.5)
        return tf.reduce_sum(l1_loss, axis=-1)

    def softmax_loss(self, y_true, y_pred):
        """Cross-entropy between one-hot targets and predicted probabilities."""
        # Clamp to avoid log(0).
        y_pred = tf.maximum(y_pred, 1e-7)
        softmax_loss = -tf.reduce_sum(y_true * tf.math.log(y_pred), axis=-1)
        return softmax_loss

    def compute_loss(self, y_true, y_pred):
        """Return the scalar loss for a batch.

        Args:
            y_true: targets of shape (batch, num_prior, 4 + num_class);
                column 4 is 1 for background priors and 0 for positives.
            y_pred: network output with the same layout.
        """
        # Targets may arrive in a different float type than the predictions.
        y_true = tf.cast(y_true, y_pred.dtype)
        # Take the batch size from the tensor rather than from a global.
        n_batch = tf.shape(y_true)[0]
        row_width = 4 + num_class

        background = y_true[:, :, 4]

        # Number of positive priors per image, shape (batch,).
        num_pos = tf.reduce_sum(1 - background, axis=-1)
        # Negatives wanted per image: a multiple of the positives, capped by availability.
        num_neg = tf.minimum(neg_pos_ratio * num_pos, num_prior - num_pos)
        # If no image wants any negatives, fall back to a fixed number.
        has_min = tf.cast(tf.reduce_any(tf.greater(num_neg, 0)), num_neg.dtype)
        num_neg = tf.concat([num_neg, [(1 - has_min) * negatives_for_hard]], axis=0)
        # Use the mean of the non-zero counts as the per-image number of negatives.
        num_neg_batch = tf.reduce_mean(tf.boolean_mask(num_neg, tf.greater(num_neg, 0)))
        num_neg_batch = tf.cast(num_neg_batch, tf.int32)

        # Highest non-background confidence of every prior.
        max_confs = tf.reduce_max(y_pred[:, :, 5:5 + num_class - 1], axis=2)

        # Hard negatives: background priors with the highest object confidence.
        _, indices = tf.nn.top_k(max_confs * background, k=num_neg_batch)

        # Convert (image, prior) pairs into indices of the flattened tensors.
        batch_idx = tf.expand_dims(tf.range(0, n_batch), 1)
        batch_idx = tf.tile(batch_idx, tf.stack([1, num_neg_batch]))
        full_indices = (tf.reshape(batch_idx, [-1]) * num_prior + tf.reshape(indices, [-1]))

        positive_mask = tf.equal(background, 0)
        y_true_pos = y_true[positive_mask]
        y_pred_pos = y_pred[positive_mask]
        y_true_neg = tf.gather(tf.reshape(y_true, (-1, row_width)), axis=0, indices=full_indices)
        y_pred_neg = tf.gather(tf.reshape(y_pred, (-1, row_width)), axis=0, indices=full_indices)

        y_true_valid = tf.concat([y_true_pos, y_true_neg], axis=0)
        y_pred_valid = tf.concat([y_pred_pos, y_pred_neg], axis=0)

        loc_loss = self.l1_smooth_loss(y_true_pos[:, :4], y_pred_pos[:, :4])
        conf_loss = self.softmax_loss(y_true_valid[:, 4:], y_pred_valid[:, 4:])

        # Mean over positives; dividing by max(count, 1) yields 0 instead of
        # NaN when the batch contains no positive prior.
        num_loc = tf.cast(tf.size(loc_loss), loc_loss.dtype)
        mean_loc_loss = tf.reduce_sum(loc_loss) / tf.maximum(num_loc, 1.0)

        return mean_loc_loss + tf.reduce_mean(conf_loss)


# Decode the network's regression output into candidate boxes
def decoder(loc):
    """Turn predicted offsets into corner-format boxes.

    Reads the module-level ``prior_center_wh`` (one ``cx, cy, w, h`` row per
    prior box) and ``variances``.

    Args:
        loc: array with one row of four predicted offsets per prior box.

    Returns:
        Array with one ``[xmin, ymin, xmax, ymax]`` row per prior box,
        clipped to [0, 1].
    """
    anchor_cx = prior_center_wh[:, 0]
    anchor_cy = prior_center_wh[:, 1]
    anchor_w = prior_center_wh[:, 2]
    anchor_h = prior_center_wh[:, 3]

    # Undo the encoding: shift the centre, rescale the size exponentially.
    cx = loc[:, 0] * anchor_w * variances[0] + anchor_cx
    cy = loc[:, 1] * anchor_h * variances[1] + anchor_cy
    half_w = 0.5 * (np.exp(loc[:, 2] * variances[2]) * anchor_w)
    half_h = 0.5 * (np.exp(loc[:, 3] * variances[3]) * anchor_h)

    # Centre/size -> corners, one column per coordinate.
    corners = np.stack((cx - half_w, cy - half_h, cx + half_w, cy + half_h), axis=-1)

    # Keep the boxes inside the normalised image.
    return np.clip(corners, 0.0, 1.0)


# Run per-class non-maximum suppression on the candidates to get the final boxes
def detection_out(pred):
    """Convert the raw prediction of one image into a list of detections.

    Reads the module-level ``num_class``, ``confidence_threshold``,
    ``keep_top_k`` and ``nms_thresh``.

    Args:
        pred: array with one row per prior box: four regression values
            followed by the class probabilities.

    Returns:
        An empty list when nothing passes the confidence threshold; otherwise
        an array with rows ``[label, confidence, xmin, ymin, xmax, ymax]``
        sorted by descending confidence and limited to ``keep_top_k`` rows.
    """
    class_probs = pred[:, 4:]
    candidate_boxes = decoder(pred[:, :4])

    results = []
    # Class 0 is skipped; the loop covers classes 1 .. num_class - 1.
    for c in range(1, num_class):
        scores = class_probs[:, c]
        confident = scores > confidence_threshold
        picked_scores = scores[confident]
        if len(picked_scores) == 0:
            continue
        picked_boxes = candidate_boxes[confident]

        # Suppress overlapping candidates of this class.
        keep = tf.image.non_max_suppression(picked_boxes, picked_scores,
                                            max_output_size=keep_top_k,
                                            iou_threshold=nms_thresh).numpy()

        # Rows of [label, confidence, box corners] for the survivors.
        label_col = c * np.ones((len(keep), 1))
        score_col = picked_scores[keep][:, np.newaxis]
        results.extend(np.concatenate((label_col, score_col, picked_boxes[keep]), axis=1))

    if len(results) > 0:
        # Highest confidence first, at most keep_top_k rows overall.
        results = np.array(results)
        order = np.argsort(results[:, 1])[::-1][:keep_top_k]
        results = results[order]
    return results


# Predict on one image file and draw the detected boxes
def detect_image(filename):
    """Run the module-level ``model`` on an image and return it with boxes drawn.

    Reads the module-level ``model``, ``confidence_threshold``, ``img_size``,
    ``classes``, ``colors``, ``expand`` and ``label_size``.

    Args:
        filename: path of the image file to read.

    Returns:
        The image array; boxes and labels are drawn on it when detections exist.
    """
    test_img = cv.imread(filename)
    # Scale to [-1, 1] and add a batch axis of one; drop that axis from the prediction.
    network_input = tf.constant([test_img / 127.5 - 1])
    preds = tf.squeeze(model.predict(network_input), axis=0).numpy()

    # Decode the prediction into detections.
    results = detection_out(preds)

    if len(results) <= 0:
        return test_img
    print(filename)

    # Keep detections whose confidence reaches the threshold.
    kept = [row for row, conf in enumerate(results[:, 1]) if conf >= confidence_threshold]
    kept_labels = results[:, 0][kept].tolist()
    kept_scores = results[:, 1][kept]
    # Scale the normalised corners back to pixels.
    boxes = np.stack((results[:, 2][kept] * img_size[1],
                      results[:, 3][kept] * img_size[0],
                      results[:, 4][kept] * img_size[1],
                      results[:, 5][kept] * img_size[0]), axis=-1)

    font = cv.FONT_HERSHEY_SIMPLEX

    for label_value, score, box in zip(kept_labels, kept_scores, boxes):
        # Labels start at 1, so subtract one to index `classes` and `colors`.
        class_idx = int(label_value) - 1
        predicted_class = classes[class_idx]

        left, top, right, bottom = box
        # Grow the box by `expand` pixels, round to the nearest pixel, clamp to the image.
        left = max(0, np.floor(left - expand + 0.5).astype('int32'))
        top = max(0, np.floor(top - expand + 0.5).astype('int32'))
        right = min(img_size[1], np.floor(right + expand + 0.5).astype('int32'))
        bottom = min(img_size[0], np.floor(bottom + expand + 0.5).astype('int32'))

        # Draw the box and its caption.
        label = '{} {:.2f}'.format(predicted_class, score)

        cv.rectangle(test_img, (left, top), (right, bottom), colors[class_idx], 1)
        cv.putText(test_img, label, (left, top - int(label_size * 10)), font, label_size, colors[class_idx], 1)
    return test_img


if __name__ == '__main__':
    # Number of classes including background.
    num_class = 4
    # Image ids: 0-799 for training, 800-899 for validation, 900-999 for testing.
    train_data = list(range(800))
    validation_data = list(range(800, 900))
    test_data = range(900, 1000)
    epochs = 100
    batch_size = 8
    tf.random.set_seed(22)
    # Image size; the code indexes [0] as height and [1] as width.
    img_size = (128, 128)
    classes = ["circle", "triangle", "square"]
    # Number of prior boxes per cell on each feature map.
    prior = [4, 4, 4, 4, 4]
    # Side length of each feature map.
    feature_map = [32, 16, 7, 3, 1]
    # Minimum anchor size on each feature map.
    min_size = [4, 8, 16, 32, 64]
    # Maximum anchor size on each feature map.
    max_size = [8, 16, 32, 64, 80]
    # Anchor aspect ratios on each feature map.
    ratios = [[2], [2], [2], [2], [2]]
    # Total number of prior boxes.
    num_prior = sum([prior[x] * feature_map[x] ** 2 for x in range(len(prior))])
    # Variances used when encoding/decoding boxes against the prior boxes.
    variances = [0.1, 0.1, 0.2, 0.2]
    # Generate the prior boxes of every feature map and stack them.
    prior_center_wh = []
    prior_lt_rb = []
    for i in range(len(prior)):
        c_wh, tl_br = get_prior(i + 1)
        prior_center_wh.append(c_wh)
        prior_lt_rb.append(tl_br)
    prior_center_wh = np.vstack(prior_center_wh)
    prior_lt_rb = np.vstack(prior_lt_rb)

    # Priors whose IoU with a ground-truth box exceeds this are positives.
    overlap_threshold = 0.5
    # Ratio of negative to positive samples.
    neg_pos_ratio = 3
    # Weight of the regression loss.
    # NOTE(review): `alpha` is not referenced by the Loss class in this file.
    alpha = 1
    # Number of negatives to use when a batch contains no positives.
    negatives_for_hard = 10
    # Encoding function applied to every ground-truth box.
    f_encode = encoder
    # One distinct drawing colour per object class.
    hsv_tuples = [(x / (num_class - 1), 1., 1.) for x in range(num_class - 1)]
    colors = list(map(lambda x: colorsys.hsv_to_rgb(*x), hsv_tuples))
    colors = list(map(lambda x: (int(x[1] * 255), int(x[2] * 255), int(x[0] * 255)), colors))
    # Maximum number of boxes kept per image.
    keep_top_k = 5
    # Detections above this confidence count as objects.
    confidence_threshold = 0.5
    # NMS threshold: kept boxes may not overlap more than this.
    nms_thresh = 0.5
    # Pixels by which drawn boxes are enlarged so they do not hug the object.
    expand = 5
    # Font scale of the caption.
    label_size = 0.3

    imgs_path = r'.\shape\train_imgs'
    annotations_path = r'.\shape\annotations'
    test_path = r'.\shape\test_imgs'
    save_path = r'.\SSD_test_result'
    weight_path = r'.\SSD_weight'
    bbox_path = r'.\shape\bbox.txt'

    # Convert the XML annotations into bbox.txt (image path + boxes + class ids)
    # unless the file already exists.
    if 'bbox.txt' not in os.listdir(r'.\shape'):
        get_bbox(train_data + validation_data, bbox_path, annotations_path)

    with open(bbox_path, 'r') as f:
        bounding_info = f.readlines()

    # Create the output folders; an existing folder is only reported.
    try:
        os.mkdir(save_path)
    except FileExistsError:
        print(save_path + 'has been exist')

    try:
        os.mkdir(weight_path)
    except FileExistsError:
        print(weight_path + 'has been exist')

    model = small_ssd(input_shape=(img_size[0], img_size[1], 3))

    model.build(input_shape=(batch_size, img_size[0], img_size[1], 3))
    model.summary()

    optimizor = keras.optimizers.Adam(lr=1e-4)
    lossor = Loss().compute_loss

    model.compile(optimizer=optimizor, loss=lossor)

    # Checkpointing: considered every 3 epochs, weights only, best val_loss only.
    checkpoint_period = keras.callbacks.ModelCheckpoint(
        weight_path + '\\' + 'ep{epoch:03d}-loss{loss:.3f}-val_loss{val_loss:.3f}.h5',
        monitor='val_loss',
        save_weights_only=True,
        save_best_only=True,
        period=3
    )

    # Learning-rate schedule: halve the rate when val_loss has not improved for 3 epochs.
    reduce_lr = keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.5,
        patience=3,
        verbose=1
    )

    # Early stopping: stop when val_loss has not improved for 10 epochs.
    early_stopping = keras.callbacks.EarlyStopping(
        monitor='val_loss',
        min_delta=0,
        patience=10,
        verbose=1
    )

    model.fit_generator(generate_arrays_from_file(train_data, batch_size),
                        steps_per_epoch=max(1, len(train_data) // batch_size),
                        validation_data=generate_arrays_from_file(validation_data, batch_size),
                        validation_steps=max(1, len(validation_data) // batch_size),
                        epochs=epochs,
                        callbacks=[checkpoint_period, reduce_lr, early_stopping])

    # Run detection on every test image and save the annotated result.
    for name in test_data:
        test_img_path = test_path + '\\' + str(name) + '.jpg'
        save_img_path = save_path + '\\' + str(name) + '.png'
        test_img = detect_image(test_img_path)
        cv.imwrite(save_img_path, test_img)

模型运行结果

SSD

SSD小结

SSD是一种简单的目标检测网络，从上图可以看出SSD模型的参数量只有26M，由于其结构简单，效果稳定，因此很多场合仍然使用SSD作为目标检测算法。SSD作为一步法目标检测的元老级模型，是小伙伴们需要掌握的一个模型。