import numpy as np
import torchvision
import torch
import torchvision.transforms as transforms
from PIL import Image, ImageDraw
import matplotlib.pyplot as plt
import os
import xml.etree.ElementTree as ET

1. 目标检测和边界框

# Test image: pick one VOC2012 JPEG (the commented-out line is an alternative sample).
# imagepath = 'E:\学习\机器学习\数据集\VOC2012\VOCdevkit\VOC2012\JPEGImages\\2007_001423.jpg'
imagepath = 'E:\学习\机器学习\数据集\VOC2012\VOCdevkit\VOC2012\JPEGImages\\2007_001526.jpg'
# Read the image from disk and display it
image = plt.imread(imagepath)
plt.imshow(image)

<matplotlib.image.AxesImage at 0x22666b24208>

对于这张带标注的图像，可以从对应的xml文件中获取其坐标信息，脚本如下

# Purpose: given an image path, find the annotation XML with the same base name
# inside the annotation directory and return the bounding boxes it lists.
def image_to_boxes(imagepath, xml_dir=r'E:\学习\机器学习\数据集\VOC2012\VOCdevkit\VOC2012\Annotations'):
    """Return the annotated bounding boxes of one image.

    Args:
        imagepath: path of the image file; only its base name (without the
            extension) is used. Both ``\\`` and ``/`` separators are accepted.
        xml_dir: directory holding the ``<name>.xml`` annotation files.
            Defaults to the location originally hard-coded in this script.

    Returns:
        An int64 tensor of shape (num_objects, 4) whose rows are
        (xmin, ymin, xmax, ymax); shape (0, 4) when the file lists no object.

    Raises:
        FileNotFoundError: if the annotation file does not exist.
        xml.etree.ElementTree.ParseError: if the annotation is not valid XML.
    """
    # Normalise separators so the base name is found on every platform, and
    # strip only the final extension so names containing dots stay intact.
    filename = imagepath.replace('\\', '/').rsplit('/', 1)[-1]
    imagename = os.path.splitext(filename)[0]
    xmlpath = os.path.join(xml_dir, imagename + '.xml')
    # Parse the annotation and collect the corners of every <object>.
    root = ET.parse(xmlpath).getroot()
    boxes = []
    for obj in root.findall('object'):
        bbox = obj.find('bndbox')
        # Coordinates may be written as floats ("18.0"), so go through float().
        boxes.append([int(float(bbox.find(tag).text.strip()))
                      for tag in ('xmin', 'ymin', 'xmax', 'ymax')])
    # reshape keeps a well-formed (0, 4) result for images without objects.
    return torch.tensor(boxes, dtype=torch.long).reshape(-1, 4)

测试

# Load the annotated boxes of the test image and display them
boxes = image_to_boxes(imagepath)
boxes

tensor([[  3,  18, 114, 298],
        [109,  51, 214, 297],
        [212,  35, 316, 297],
        [289,  33, 387, 297],
        [381,  15, 500, 297]])

现在可以从图像中获取到坐标信息，边界框是矩形的，由矩形左上角的 x 和 y 坐标以及右下角的坐标决定。另一种常用的边界框表示方法是边界框中心的 (x,y) 轴坐标以及框的宽度和高度。

# Convert boxes from (upper-left, lower-right) form to (center, width, height) form
def box_corner_to_center(boxes):
    """Convert (x1, y1, x2, y2) boxes to (cx, cy, w, h).

    Args:
        boxes: tensor or nested list whose last dimension holds
            (x1, y1, x2, y2). Any number of leading dimensions is accepted.

    Returns:
        A floating-point tensor of the same shape holding (cx, cy, w, h).
    """
    # as_tensor avoids the copy-construct UserWarning that torch.tensor()
    # raises when the input is already a tensor.
    boxes = torch.as_tensor(boxes)
    # Work in floating point throughout so the averaged centers and the
    # widths/heights share one dtype when stacked.
    if not torch.is_floating_point(boxes):
        boxes = boxes.to(torch.get_default_dtype())
    x1, y1, x2, y2 = boxes[..., 0], boxes[..., 1], boxes[..., 2], boxes[..., 3]
    cx = (x1 + x2) / 2
    cy = (y1 + y2) / 2
    w = x2 - x1
    h = y2 - y1
    # Stack along a new last dimension so each row is one box again.
    return torch.stack((cx, cy, w, h), dim=-1)
# Convert the loaded boxes to (center, width, height) form and display them
boxes = box_corner_to_center(boxes)
boxes

E:\anacanda\envs\project\lib\site-packages\ipykernel_launcher.py:5: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
  """
tensor([[ 0.3295,  0.5800, -0.0630,  0.0600],
        [ 0.3173,  0.5640, -0.0445,  0.0120],
        [ 0.2975,  0.4858, -0.0310, -0.0815],
        [ 0.2350,  0.3733, -0.2060, -0.1965]])

# Convert boxes from (center, width, height) form to (upper-left, lower-right) form
def box_center_to_corner(boxes):
    """Convert (cx, cy, w, h) boxes to (x1, y1, x2, y2).

    Args:
        boxes: tensor or nested list whose last dimension holds
            (cx, cy, w, h). Any number of leading dimensions is accepted.

    Returns:
        A floating-point tensor of the same shape holding (x1, y1, x2, y2).
    """
    # as_tensor avoids the copy-construct UserWarning that torch.tensor()
    # raises when the input is already a tensor.
    boxes = torch.as_tensor(boxes)
    # Half-widths are fractional, so compute everything in floating point.
    if not torch.is_floating_point(boxes):
        boxes = boxes.to(torch.get_default_dtype())
    cx, cy, w, h = boxes[..., 0], boxes[..., 1], boxes[..., 2], boxes[..., 3]
    x1 = cx - 0.5 * w
    y1 = cy - 0.5 * h
    x2 = cx + 0.5 * w
    y2 = cy + 0.5 * h
    # Stack along a new last dimension so each row is one box again.
    return torch.stack((x1, y1, x2, y2), dim=-1)
# Convert back to (upper-left, lower-right) form and display the result
boxes = box_center_to_corner(boxes)
boxes

E:\anacanda\envs\project\lib\site-packages\ipykernel_launcher.py:5: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
  """
tensor([[0.3610, 0.5500, 0.2980, 0.6100],
        [0.3395, 0.5580, 0.2950, 0.5700],
        [0.3130, 0.5265, 0.2820, 0.4450],
        [0.3380, 0.4715, 0.1320, 0.2750]])

可以看见，经过两次的转换，数据会变回原来的格式

# Round-trip check: corner -> center -> corner should reproduce every coordinate
box_center_to_corner(box_corner_to_center(boxes)) == boxes

E:\anacanda\envs\project\lib\site-packages\ipykernel_launcher.py:5: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
  """
tensor([[True, True, True, True],
        [True, True, True, True],
        [True, True, True, True],
        [True, True, True, True],
        [True, True, True, True]])

现在尝试在原图上根据这些坐标绘制出边界框。这里定义一个辅助函数 bbox_to_rect。它将边界框表示成 matplotlib 的边界框格式

# Convert one box from (upper-left x, upper-left y, lower-right x, lower-right y)
# into the matplotlib rectangle form: ((upper-left x, upper-left y), width, height)
def bbox_to_rect(bbox, color, linewidth=2):
    """Build an unfilled matplotlib rectangle outlining a single box.

    Args:
        bbox: one box indexed as (x1, y1, x2, y2).
        color: edge colour of the outline.
        linewidth: thickness of the outline.

    Returns:
        A ``plt.Rectangle`` patch; the caller adds it to an axes.
    """
    left, top, right, bottom = bbox[0], bbox[1], bbox[2], bbox[3]
    # fill=False keeps only the outline; a filled patch would hide the image.
    return plt.Rectangle(
        (left, top),        # upper-left corner
        right - left,       # width
        bottom - top,       # height
        fill=False,
        edgecolor=color,
        linewidth=linewidth,
    )

在原图上绘制边界框

# One colour per box; this list has five entries, so at most five boxes can be drawn
color = ['blue','red','yellow','gray','pink']
# Show the image
img = plt.imshow(image)
# Hide the axes
plt.axis('off')
# Outline each box in its own colour
for index, bbox in enumerate(boxes):
    # Draw the box on top of the image by adding a rectangle patch
    img.axes.add_patch(bbox_to_rect(bbox, color[index], linewidth=3))

2. 锚框

torch.set_printoptions(2)  # print tensors with 2 digits of precision to keep output short

要生成多个不同形状的锚框，让我们设置一系列缩放比 $s_1,\ldots,s_n$ 和一系列宽高比 $r_1,\ldots,r_m$ 。当以每个像素为中心、使用这些缩放比和宽高比的所有组合时，输入图像总共会有 $whnm$ 个锚框（$w$、$h$ 为图像的宽和高）。

尽管这些锚框可能会覆盖所有真实边界框，但计算复杂度很容易过高。在实践中，我们只考虑包含 $s_1$ 或 $r_1$ 的组合：

$(s_1,r_1),(s_1,r_2),\ldots,(s_1,r_m),(s_2,r_1),(s_3,r_1),\ldots,(s_n,r_1)$

也就是说，以同一像素为中心的锚框数量是 $n+m-1$ 。

# Purpose: given an input image tensor, a list of scales and a list of aspect
# ratios, generate anchor boxes of different shapes centred on every pixel and
# return all of them.
def multibox_prior(data, sizes, ratios, verbose=True):
    """Generate anchor boxes centred on every pixel of ``data``.

    Only combinations containing ``sizes[0]`` or ``ratios[0]`` are produced,
    i.e. ``len(sizes) + len(ratios) - 1`` anchors per pixel, ordered as
    (s1, r1), (s2, r1), ..., (sn, r1), (s1, r2), ..., (s1, rm).

    Args:
        data: tensor whose last two dimensions are (height, width); only its
            shape and device are used.
        sizes: anchor scales. In normalised units an anchor has height
            ``s / sqrt(r)`` and width ``s * sqrt(r) * height / width``, so an
            anchor with ratio 1 is a square in pixel space.
        ratios: anchor aspect ratios (width / height).
        verbose: when True (the default, which keeps the original printed
            output) print the input shape and the per-pixel anchor widths
            and heights.

    Returns:
        Tensor of shape (1, height * width * anchors_per_pixel, 4) holding
        (x1, y1, x2, y2) normalised by the image width and height. Values may
        fall outside [0, 1] for anchors that extend past the image border.
    """
    device = data.device
    # Force a floating dtype so integer-only lists such as [1, 2] can be
    # passed through torch.sqrt on every torch version.
    float_dtype = torch.get_default_dtype()
    size_tensor = torch.tensor(sizes, dtype=float_dtype, device=device)
    ratio_tensor = torch.tensor(ratios, dtype=float_dtype, device=device)
    if verbose:
        print(data.shape)
    img_height, img_width = data.shape[-2:]
    # Keep the anchor count manageable: n + m - 1 boxes per pixel.
    boxes_per_pixel = len(sizes) + len(ratios) - 1

    # Normalised pixel centres; the 0.5 moves from the pixel corner to its centre.
    steps_h = 1.0 / img_height
    steps_w = 1.0 / img_width
    center_h = (torch.arange(img_height, device=device) + 0.5) * steps_h
    center_w = (torch.arange(img_width, device=device) + 0.5) * steps_w
    # Row-major grid of centres, flattened. Equivalent to
    # meshgrid(center_h, center_w) with 'ij' indexing followed by reshape(-1),
    # but free of the meshgrid `indexing` deprecation warning.
    shift_y = center_h.repeat_interleave(img_width)
    shift_x = center_w.repeat(img_height)
    # Duplicate each centre so the same row can receive both corner offsets,
    # then repeat every pixel's row once per anchor (rows of one pixel stay adjacent).
    center_point = torch.stack([shift_x, shift_y, shift_x, shift_y], dim=1)
    center_point = center_point.repeat_interleave(boxes_per_pixel, dim=0)

    # Normalised anchor widths and heights. The width is rescaled by
    # height / width because the two axes are normalised by different
    # lengths; without it a ratio-1 anchor would not be square in pixels.
    # (An alternative is to size anchors in pixels from sqrt(width * height)
    # and normalise afterwards.)
    anchor_w = torch.cat((size_tensor * torch.sqrt(ratio_tensor[0]),
                          size_tensor[0] * torch.sqrt(ratio_tensor[1:]))) \
        * img_height / img_width
    anchor_h = torch.cat((size_tensor / torch.sqrt(ratio_tensor[0]),
                          size_tensor[0] / torch.sqrt(ratio_tensor[1:])))
    if verbose:
        print(anchor_w)
        print(anchor_h)

    # Half-extent offsets (-w/2, -h/2, +w/2, +h/2) for one pixel, tiled over
    # all pixels: transpose to (anchors_per_pixel, 4), then repeat block-wise.
    anchor_offset = torch.stack((-anchor_w, -anchor_h, anchor_w, anchor_h))
    anchor_offset = anchor_offset.T.repeat(img_height * img_width, 1) / 2
    # Centre + offset gives the corner coordinates of every anchor.
    anchors = center_point + anchor_offset
    return anchors.unsqueeze(0)

为了显示以图像中一个像素为中心的所有锚框，我们定义了以下 show_bboxes 函数来在图像上绘制多个边界框

# Purpose: draw a set of bounding boxes (e.g. the anchors of one pixel) with labels
def show_bboxes(axes, bboxes, labels=None, colors=None):
    """Draw every box in ``bboxes`` on ``axes`` together with a text label.

    Args:
        axes: matplotlib axes to draw on.
        bboxes: iterable of boxes, each indexed as (x1, y1, x2, y2).
        labels: per-box labels, indexed by box position; defaults to the box
            indices 0..len(bboxes)-1.
        colors: colours cycled through box by box; defaults to five colours.
    """
    palette = ['blue', 'red', 'green', 'gray', 'pink'] if colors is None else colors
    captions = list(range(len(bboxes))) if labels is None else labels
    for idx, box in enumerate(bboxes):
        # Cycle through the palette when there are more boxes than colours.
        shade = palette[idx % len(palette)]
        patch = bbox_to_rect(box, shade)
        # Outline of the box.
        axes.add_patch(patch)
        # Label drawn at the box's upper-left corner on a matching background.
        corner_x, corner_y = patch.xy[0], patch.xy[1]
        axes.text(corner_x, corner_y, captions[idx],
                  fontsize=20, color='white', va='center', ha='center',
                  bbox=dict(facecolor=shade, edgecolor="black"))

下面是测试代码：先生成图像上所有像素点的anchor框，然后挑选其中一个像素点，把它的5个anchor绘制出来

# Demo: generate anchors for every pixel of the test image, then draw the
# anchors of a single pixel.
# imagepath = 'E:\学习\机器学习\数据集\VOC2012\VOCdevkit\VOC2012\JPEGImages\\2007_001423.jpg'
imagepath = 'E:\学习\机器学习\数据集\VOC2012\VOCdevkit\VOC2012\JPEGImages\\2007_001526.jpg'
image = plt.imread(imagepath)
# print("image.shape:{}".format(image.shape))
h, w = image.shape[:2]
# print(h, w)
# Random stand-in input with the image's height and width
X = torch.rand(size=(1, 3, h, w))
# Scales and aspect ratios (each size is a fraction of the original image)
sizes = [0.75, 0.5, 0.25]
ratios = [1, 2, 0.5]
# Another way to define anchors would be to specify their areas
Y = multibox_prior(X, sizes, ratios)
# 5 = len(sizes) + len(ratios) - 1 anchors per pixel, 4 coordinates each
boxes = Y.reshape(h, w, 5, 4)
print("boxes.shape:{}".format(boxes.shape))
# print(Y)
# Print all anchors centred on one pixel
print(boxes[250, 250, :, :])
# Show the image to draw on
fig = plt.imshow(image)
# Anchor coordinates are normalised, so scale them back to pixel units
bbox_scale = torch.tensor((w, h, w, h))
show_bboxes(fig.axes, boxes[250, 250, :, :] * bbox_scale)

torch.Size([1, 3, 298, 500])
tensor([0.4470, 0.2980, 0.1490, 0.6322, 0.3161])
tensor([0.7500, 0.5000, 0.2500, 0.5303, 1.0607])
boxes.shape:torch.Size([298, 500, 5, 4])
tensor([[0.2775, 0.4656, 0.7245, 1.2156],
        [0.3520, 0.5906, 0.6500, 1.0906],
        [0.4265, 0.7156, 0.5755, 0.9656],
        [0.1849, 0.5754, 0.8171, 1.1058],
        [0.3430, 0.3103, 0.6590, 1.3709]])

3. 交并比

接下来使用交并比来衡量锚框和真实边界框之间、以及不同锚框之间的相似度

先可以查看上面这幅图像的标注信息，由于这次的测试图像有5个人像，所以返回的标注信息也是5个

# Load the ground-truth boxes of the test image
# imagepath = 'E:\学习\机器学习\数据集\VOC2012\VOCdevkit\VOC2012\JPEGImages\\2007_001423.jpg'
imagepath = 'E:\学习\机器学习\数据集\VOC2012\VOCdevkit\VOC2012\JPEGImages\\2007_001526.jpg'
gt_boxes = image_to_boxes(imagepath)
gt_boxes

[[3, 18, 114, 298],
 [109, 51, 214, 297],
 [212, 35, 316, 297],
 [289, 33, 387, 297],
 [381, 15, 500, 297]]

以刚刚所测试的边界框为示例，计算两个框之间的交并比，不过这里需要对边界框进行转换

# 查看转换后的边界框的真实坐标
pd_boxes = boxes[250, 250, :, :] * bbox_scale
pd_boxes

tensor([[138.7500, 138.7500, 362.2500, 362.2500],
        [176.0000, 176.0000, 325.0000, 325.0000],
        [213.2500, 213.2500, 287.7500, 287.7500],
        [ 92.4617, 171.4808, 408.5384, 329.5192],
        [171.4808,  92.4616, 329.5192, 408.5384]])

下面定义一个函数可以计算anchor(pd_boxes)与真实边界框(gt_boxes)之间的交并比

# Purpose: compute the pairwise intersection-over-union of two lists of boxes
# Returns: one row per box in boxes1 and one column per box in boxes2,
# e.g. 5 objects against 5 anchors gives a 5x5 matrix
def box_iou(boxes1, boxes2):
    """Return the IoU between every box of ``boxes1`` and every box of ``boxes2``.

    Args:
        boxes1: (N, 4) tensor or nested list of (x1, y1, x2, y2) boxes.
        boxes2: (M, 4) tensor or nested list of (x1, y1, x2, y2) boxes.

    Returns:
        (N, M) tensor whose entry [i, j] is
        intersection(boxes1[i], boxes2[j]) / union(boxes1[i], boxes2[j]).
    """
    # Accept plain lists as well as tensors.
    boxes1, boxes2 = (b if isinstance(b, torch.Tensor) else torch.tensor(b)
                      for b in (boxes1, boxes2))

    def _area(b):
        # width * height from the corner coordinates
        return (b[:, 2] - b[:, 0]) * (b[:, 3] - b[:, 1])

    areas1, areas2 = _area(boxes1), _area(boxes2)   # shapes (N,) and (M,)
    # Indexing with None inserts an axis so boxes1 broadcasts against
    # boxes2, giving (N, M, 2) corner tensors for every pair.
    top_left = torch.max(boxes1[:, None, :2], boxes2[:, :2])
    bottom_right = torch.min(boxes1[:, None, 2:], boxes2[:, 2:])
    # A negative extent means the pair does not overlap; clamp it to 0.
    overlap_wh = (bottom_right - top_left).clamp(min=0)
    inter_areas = overlap_wh[..., 0] * overlap_wh[..., 1]       # (N, M)
    # union = area1 + area2 - intersection, broadcast to (N, M)
    union_areas = areas1.unsqueeze(1) + areas2 - inter_areas
    # IoU = intersection / union
    return inter_areas / union_areas

下面计算上面所使用的的黑人图像与在[250, 250]像素点所预测的5个anchor之间的交并比

# IoU of every ground-truth box (rows) against every anchor of pixel [250, 250] (columns)
iou = box_iou(gt_boxes, pd_boxes)
iou, iou.shape

(tensor([[0.0000, 0.0000, 0.0000, 0.0348, 0.0000],
         [0.1864, 0.1059, 0.0018, 0.2105, 0.1296],
         [0.2709, 0.3414, 0.2037, 0.2035, 0.3803],
         [0.1805, 0.0996, 0.0000, 0.1936, 0.1227],
         [0.0000, 0.0000, 0.0000, 0.0432, 0.0000]]),
 torch.Size([5, 5]))

对应着之前在250x250这个生成的像素点的图像观察

可以比较清楚的看见，对于第一个黑人的图像，其真实的标注框与0,1,2,4四个生成的anchor都没有联系，只与第3个anchor有少量的交集，可以看见，函数计算出来的交并比为[0.0000, 0.0000, 0.0000, 0.0348, 0.0000]，这是符合我们所观察到的数据的。也就是说，对于第一个真实标注框，其与250x250该像素点生成的5个anchor的交并比分别为：0.0000, 0.0000, 0.0000, 0.0348, 0.0000，其余的类似

至此，接下来需要对anchor进行训练。而训练需要对anchor设定类别与偏移量

4. 将真实边界框分配给锚框

给定图像，假设锚框是 $A_1, A_2, \ldots, A_{n_a}$ ，真实边界框是 $B_1, B_2, \ldots, B_{n_b}$ ，其中 $n_a \geq n_b$ 。让我们定义一个矩阵 $X \in \mathbb{R}^{n_a \times n_b}$ ，其中第 $i$ 行、第 $j$ 列的元素 $x_{ij}$ 是锚框 $A_i$ 和真实边界框 $B_j$ 的 IoU 。该算法包含以下步骤：

1）在矩阵 $X$ 中找到最大的元素，并将它的行索引和列索引分别表示为 $i_1$ 和 $j_1$ 。然后将真实边界框 $B_{j_1}$ 分配给锚框 $A_{i_1}$ 。这很直观，因为 $A_{i_1}$ 和 $B_{j_1}$ 是所有锚框和真实边界框配对中最相近的。在第一个分配完成后，丢弃矩阵中第 $i_1$ 行和第 $j_1$ 列中的所有元素。

2）在矩阵 $X$ 中找到剩余元素中最大的元素，并将它的行索引和列索引分别表示为 $i_2$ 和 $j_2$ 。我们将真实边界框 $B_{j_2}$ 分配给锚框 $A_{i_2}$ ，并丢弃矩阵中第 $i_2$ 行和第 $j_2$ 列中的所有元素。

3）此时，矩阵 $X$ 中两行和两列中的元素已被丢弃。我们继续，直到丢弃掉矩阵 $X$ 中 $n_b$ 列中的所有元素。此时，我们已经为 $n_b$ 个锚框各自分配了一个真实边界框。

4）只遍历剩下的 $n_a - n_b$ 个锚框。例如，给定任何锚框 $A_i$ ，在矩阵 $X$ 的第 $i$ 行中找到与 $A_i$ 的 IoU 最大的真实边界框 $B_j$ ，只有当此 IoU 大于预定义的阈值时，才将 $B_j$ 分配给 $A_i$ 。

用一个实际的例子说明以上过程：

让我们用一个具体的例子来说明上述算法。如图13.4.2（左）所示，假设矩阵 X 中的最大值为 x23 ，我们将真实边界框 B3 分配给锚框 A2 。然后，我们丢弃矩阵第 2 行和第 3 列中的所有元素，在剩余元素（阴影区域）中找到最大的 x71 ，然后将真实边界框 B1 分配给锚框 A7 。接下来，如图13.4.2（中）所示，丢弃矩阵第 7 行和第 1 列中的所有元素，在剩余元素（阴影区域）中找到最大的 x54 ，然后将真实边界框 B4 分配给锚框 A5 。最后，如图13.4.2（右）所示，丢弃矩阵第 5 行和第 4 列中的所有元素，在剩余元素（阴影区域）中找到最大的 x92 ，然后将真实边界框 B2 分配给锚框 A9 。之后，我们只需要遍历剩余的锚框 A1,A3,A4,A6,A8 ，然后根据阈值确定是否为它们分配真实边界框。

重新回顾一下真实边界框

# Reload the ground-truth boxes of the test image
imagepath = 'E:\学习\机器学习\数据集\VOC2012\VOCdevkit\VOC2012\JPEGImages\\2007_001526.jpg'
ground_truth = image_to_boxes(imagepath)
ground_truth

[[3, 18, 114, 298],
 [109, 51, 214, 297],
 [212, 35, 316, 297],
 [289, 33, 387, 297],
 [381, 15, 500, 297]]

显示150x150与150x300这两个像素点的anchor，每个像素点5个框，也就是一共有10个框。现在假设使用150x150与150x300这两个像素点的anchor对真实标注框进行预测，现在尝试将5个真实边界框分配给这10个anchor

# Draw the anchors of two pixels (labelled 0-4 and 5-9) plus the ground-truth boxes
# color = ['blue','red','yellow','gray','pink']
fig = plt.imshow(image)
# Scale factor from normalised to pixel coordinates
bbox_scale = torch.tensor((w, h, w, h))
label = [i for i in range(10)]
show_bboxes(fig.axes, boxes[150, 150, :, :] * bbox_scale, labels=label[:5])
show_bboxes(fig.axes, boxes[150, 300, :, :] * bbox_scale, labels=label[5:])
for index, bbox in enumerate(gt_boxes):
    # Draw each ground-truth box in black by adding a rectangle patch
    fig.axes.add_patch(bbox_to_rect(bbox, 'black', linewidth=3))

# Purpose: given ground-truth boxes and anchors, assign the closest ground-truth
# box to each anchor.
# Idea: first give every anchor its best ground-truth box when the IoU reaches
# the threshold; then, once per ground-truth box, take the globally largest IoU,
# record that pairing and discard its row and column, so that every
# ground-truth box ends up with an anchor.
def assign_anchor_to_bbox(ground_truth, anchors, device, iou_threshold=0.5):
    """Map every anchor to the index of a ground-truth box, or -1 for none.

    Args:
        ground_truth: (num_gt_boxes, 4) ground-truth boxes as (x1, y1, x2, y2).
        anchors: (num_anchors, 4) anchors in the same coordinate system.
        device: device on which the result is allocated.
        iou_threshold: minimum IoU for the threshold-based assignment.

    Returns:
        int64 tensor of shape (num_anchors,) whose entry i is the index of the
        ground-truth box assigned to anchor i, or -1 if unassigned.
    """
    num_anchors, num_gt_boxes = anchors.shape[0], ground_truth.shape[0]
    anchors_bbox_map = torch.full((num_anchors,), -1, dtype=torch.long, device=device)
    # Nothing to assign when either side is empty.
    if num_anchors == 0 or num_gt_boxes == 0:
        return anchors_bbox_map
    # IoU table: rows are anchors, columns are ground-truth boxes.
    jaccard = box_iou(anchors, ground_truth)
    # Best ground-truth box for every anchor and the IoU it reaches.
    max_ious, indices = torch.max(jaccard, dim=1)
    # Threshold-based assignment (honours iou_threshold; the value used to be
    # hard-coded as 0.5 and the parameter ignored).
    above = max_ious >= iou_threshold
    anchors_bbox_map[above] = indices[above]
    # Greedy pass: one anchor per ground-truth box. Bounded by the number of
    # anchors so that no row is picked again after all rows were discarded.
    for _ in range(min(num_gt_boxes, num_anchors)):
        # argmax without dim works on the flattened table; recover the row
        # (anchor) and column (ground-truth box) with exact integer arithmetic.
        max_idx = int(torch.argmax(jaccard))
        anc_idx, box_idx = divmod(max_idx, num_gt_boxes)
        anchors_bbox_map[anc_idx] = box_idx
        # Discard the used column and row; -1 is below every valid IoU.
        jaccard[:, box_idx] = -1
        jaccard[anc_idx, :] = -1
    return anchors_bbox_map
# Purpose: given anchors and the ground-truth boxes assigned to them, compute
# the regression offsets of each anchor
def offset_boxes(anchors, assigned_bb, eps=1e-6):
    """Encode assigned boxes as offsets relative to their anchors.

    Args:
        anchors: (N, 4) anchors as (x1, y1, x2, y2).
        assigned_bb: (N, 4) boxes assigned to the anchors, same format.
        eps: small constant added inside the logarithm.

    Returns:
        (N, 4) tensor of (dx, dy, dw, dh) offsets.
    """
    # Work in (center, width, height) form.
    anc_cwh = box_corner_to_center(anchors)
    gt_cwh = box_corner_to_center(assigned_bb)
    anc_center, anc_size = anc_cwh[:, :2], anc_cwh[:, 2:]
    gt_center, gt_size = gt_cwh[:, :2], gt_cwh[:, 2:]
    # Centre shift relative to the anchor size; 10 = 1 / sigma with sigma_x = sigma_y = 0.1
    offset_xy = 10 * (gt_center - anc_center) / anc_size
    # Log size ratio; 5 = 1 / sigma with sigma_w = sigma_h = 0.2
    offset_wh = 5 * torch.log(eps + gt_size / anc_size)
    return torch.cat([offset_xy, offset_wh], dim=1)
# Purpose: label anchors using ground-truth boxes
def multibox_target(anchors, labels):
    """Build training targets (offsets, mask, class labels) for every anchor.

    Args:
        anchors: anchors with a leading dimension of size 1 that is squeezed
            away, leaving (num_anchors, 4).
        labels: (batch_size, num_objects, 5) ground truth; column 0 is the
            class index and columns 1-4 are the box coordinates.

    Returns:
        Tuple (bbox_offset, bbox_mask, class_labels): the first two are
        (batch_size, num_anchors * 4), the last is (batch_size, num_anchors).
        Class 0 means background; real classes are shifted up by one.
    """
    # labels plays the role of the ground truth, e.g. torch.Size([1, 5, 5])
    print("anchors.shape:{}, labels.shape:{}".format(anchors.shape, labels.shape))
    # batch_size is the number of images; the anchors are shared by all of them
    batch_size, anchors = labels.shape[0], anchors.squeeze(0)
    batch_offset, batch_mask, batch_class_labels = [], [], []
    # Device and number of anchors
    device, num_anchors = anchors.device, anchors.shape[0]
    # Process the images one by one
    for i in range(batch_size):
        # Ground truth of the i-th image
        label = labels[i, :, :]
        # Assign anchors; column 0 of label is the class (left out here),
        # columns 1-4 are the box coordinates
        anchors_bbox_map = assign_anchor_to_bbox(
            label[:, 1:], anchors, device)
        # 1 for anchors with an assigned box (map value >= 0), 0 otherwise,
        # repeated 4 times to cover the 4 offset coordinates
        bbox_mask = ((anchors_bbox_map >= 0).float().unsqueeze(-1)).repeat(1, 4)
        # Initialise class labels and assigned box coordinates to zero
        class_labels = torch.zeros(num_anchors, dtype=torch.long, device=device)
        assigned_bb = torch.zeros((num_anchors, 4), dtype=torch.float32, device=device)
        # Label the anchors with their ground-truth class.
        # An anchor without an assignment stays background (value zero)
        # indices_true has shape (k, 1); the indexed assignments below rely on that shape
        indices_true = torch.nonzero(anchors_bbox_map >= 0)
        bb_idx = anchors_bbox_map[indices_true]
        class_labels[indices_true] = label[bb_idx, 0].long() + 1    # ground-truth class, shifted by 1 past background
        assigned_bb[indices_true] = label[bb_idx, 1:]               # ground-truth box of the anchor
        # Multiplying by bbox_mask zeroes the offsets of anchors without a ground-truth match
        offset = offset_boxes(anchors, assigned_bb) * bbox_mask     # convert boxes to offsets
        # Collect the results of this image
        batch_offset.append(offset.reshape(-1))
        batch_mask.append(bbox_mask.reshape(-1))
        batch_class_labels.append(class_labels)
    bbox_offset = torch.stack(batch_offset)
    bbox_mask = torch.stack(batch_mask)
    class_labels = torch.stack(batch_class_labels)
    return (bbox_offset, bbox_mask, class_labels)

测试代码：

# Demo: label the 10 anchors of two pixels with the ground truth of the test image
if __name__ == '__main__':
    # imagepath = 'E:\学习\机器学习\数据集\VOC2012\VOCdevkit\VOC2012\JPEGImages\\2007_001423.jpg'
    imagepath = 'E:\学习\机器学习\数据集\VOC2012\VOCdevkit\VOC2012\JPEGImages\\2007_001526.jpg'
    image = plt.imread(imagepath)
    # print("image.shape:{}".format(image.shape))
    h, w = image.shape[:2]
    # Random stand-in input with the image's height and width
    X = torch.rand(size=(1, 3, h, w))
    # Scales and aspect ratios (each size is a fraction of the original image)
    sizes = [0.75, 0.5, 0.25]
    ratios = [1, 2, 0.5]
    # Another way to define anchors would be to specify their areas
    Y = multibox_prior(X, sizes, ratios)
    boxes = Y.reshape(h, w, 5, 4)
    # print(boxes.shape)
    # Two pixels are used here, so there are 10 anchors
    anchors = torch.cat([boxes[150, 150, :, :], boxes[150, 300, :, :]], dim=0)  # concatenate the two groups
    ground_truth = image_to_boxes(imagepath)    # boxes in original pixel units
    print(ground_truth)
    # NOTE(review): gtboxes_process is not defined anywhere in this file.
    # Judging from the printed output it presumably prepends a class column of
    # 0 and divides the coordinates by (w, h, w, h) -- confirm and add it.
    ground_truth = gtboxes_process(X, ground_truth)   # values after normalisation
    print(ground_truth)
    # Returns the offsets, the mask and the class label of every input anchor
    labels = multibox_target(anchors.unsqueeze(dim=0), ground_truth.unsqueeze(dim=0))
    print(labels[2], labels[2].shape)
    print(labels[1], labels[1].shape)
    print(labels[0], labels[0].shape)

输出：

tensor([[  3,  18, 114, 298],
        [109,  51, 214, 297],
        [212,  35, 316, 297],
        [289,  33, 387, 297],
        [381,  15, 500, 297]])
tensor([[0.0000, 0.0060, 0.0604, 0.2280, 1.0000],
        [0.0000, 0.2180, 0.1711, 0.4280, 0.9966],
        [0.0000, 0.4240, 0.1174, 0.6320, 0.9966],
        [0.0000, 0.5780, 0.1107, 0.7740, 0.9966],
        [0.0000, 0.7620, 0.0503, 1.0000, 0.9966]])
anchors.shape:torch.Size([1, 10, 4]), labels.shape:torch.Size([1, 5, 5])
tensor([[0, 0, 0, 1, 1, 1, 0, 0, 1, 1]]) torch.Size([1, 10])
tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1.,
         1., 1., 1., 1.]]) torch.Size([1, 40])
tensor([[-0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000,
         -0.0000, -0.0000, -0.0000, -0.0000, -2.9107,  0.4746, -5.2323,  2.8598,
          0.6960,  0.7435, -2.0444, -1.2533,  1.6779,  0.6488, -4.1222,  0.8327,
         -0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000,
          4.4293,  0.3480, -4.8843,  2.8953, -2.3096,  0.4904, -2.0922, -0.9382]]) torch.Size([1, 40])

可以看见输出值包含了3个部分：分别是anchor偏移量，anchor掩码（负样本掩码为0，正样本掩码为1），还有anchor类别标签

5. 使用非极大值抑制预测边界框

# Four hand-written, heavily overlapping anchors in normalised coordinates
anchors = torch.tensor([[ 0.2120,  0.2450,  0.5100,  0.8550],   # 0.9
                        [ 0.1920,  0.2730,  0.4870,  0.8430],   # 0.8
                        [ 0.1720,  0.3040,  0.4540,  0.7490],   # 0.7
                        [ 0.2720,  0.3340,  0.4040,  0.6090]])  # 0.3
# All-zero predicted offsets: the predicted boxes equal the anchors
offset_preds = torch.tensor([0] * anchors.numel())
cls_probs = torch.tensor([[0] * 4,  # predicted probability of background
                      [0.9, 0.8, 0.7, 0.1]])  # predicted probability of "person"
fig = plt.imshow(image)
# Scale the anchors to pixel coordinates and draw them
show_bboxes(fig.axes, anchors * bbox_scale)

# Purpose: take anchors and predicted offsets and apply the inverse offset
# transform to obtain the predicted bounding-box coordinates
def offset_inverse(anchors, offset_preds):
    """Decode predicted offsets back into corner-form boxes.

    Args:
        anchors: (N, 4) anchors as (x1, y1, x2, y2).
        offset_preds: (N, 4) predicted (dx, dy, dw, dh) offsets.

    Returns:
        (N, 4) predicted boxes as (x1, y1, x2, y2).
    """
    # Anchors in (center, width, height) form.
    anc_cwh = box_corner_to_center(anchors)
    anc_center, anc_size = anc_cwh[:, :2], anc_cwh[:, 2:]
    # Invert the encoding: undo the x10 centre scaling and the x5 log-size scaling.
    center = offset_preds[:, :2] * anc_size / 10 + anc_center
    size = torch.exp(offset_preds[:, 2:] / 5) * anc_size
    # Concatenate first, then convert back to corner form.
    return box_center_to_corner(torch.cat((center, size), dim=1))
# Purpose: non-maximum suppression (NMS)
def nms(boxes, scores, iou_threshold):
    """Return the indices of the boxes kept by non-maximum suppression.

    Args:
        boxes: (N, 4) boxes as (x1, y1, x2, y2).
        scores: (N,) confidence of each box.
        iou_threshold: a box is dropped when its IoU with an already kept,
            higher-scoring box exceeds this value.

    Returns:
        int64 tensor of kept indices in order of decreasing score, on the
        device of ``boxes``; empty (still int64) when there are no boxes.
    """
    # Candidate indices sorted by decreasing confidence.
    order = torch.argsort(scores, dim=-1, descending=True)
    keep = []  # indices of the boxes that survive
    while order.numel() > 0:
        # The most confident remaining candidate is always kept.
        best = order[0]
        keep.append(int(best))
        # Nothing left to compare against.
        if order.numel() == 1:
            break
        # IoU of the kept box with every other remaining candidate.
        iou = box_iou(boxes[best, :].reshape(-1, 4),
                      boxes[order[1:], :].reshape(-1, 4)).reshape(-1)
        # Candidates with a low enough IoU are treated as different objects
        # and stay in the running.
        survivors = torch.nonzero(iou <= iou_threshold).reshape(-1)
        # iou was computed against order[1:], so shift by one to index order.
        order = order[survivors + 1]
    # An explicit dtype keeps the result usable as an index tensor even when
    # keep is empty (torch.tensor([]) would otherwise be float).
    return torch.tensor(keep, dtype=torch.long, device=boxes.device)
# Purpose: predict bounding boxes using non-maximum suppression
# Returns a tensor with one row per anchor: column 0 is the predicted class,
# column 1 the confidence, and the remaining four columns the upper-left and
# lower-right corners of the predicted box
def multibox_detection(cls_probs, offset_preds, anchors, nms_threshold=0.5,
                       pos_threshold=0.009999999):
    """Turn class probabilities and offsets into NMS-filtered detections.

    Args:
        cls_probs: (batch_size, num_classes, num_anchors) class probabilities;
            row 0 along the class axis is the background.
        offset_preds: per-image predicted offsets, reshaped to (num_anchors, 4).
        anchors: anchors with a leading dimension of size 1 that is squeezed away.
        nms_threshold: IoU threshold passed to ``nms``.
        pos_threshold: predictions with a confidence below this value are
            marked as background.

    Returns:
        (batch_size, num_anchors, 6) tensor of
        (class_id, confidence, x1, y1, x2, y2); class_id is -1 for
        suppressed or low-confidence boxes. Kept boxes come first.
    """
    # The leading dimension is the number of images (batch_size)
    device, batch_size = cls_probs.device, cls_probs.shape[0]
    anchors = anchors.squeeze(0)
    # Number of classes and number of anchors
    num_classes, num_anchors = cls_probs.shape[1], cls_probs.shape[2]
    out = []
    # Process the images one by one
    for i in range(batch_size):
        # Class probabilities and offsets of this image
        cls_prob, offset_pred = cls_probs[i], offset_preds[i].reshape(-1, 4)
        # Skipping the background row, take the per-anchor (column-wise)
        # maximum confidence and its index; the index is the class, counted from 0
        conf, class_id = torch.max(cls_prob[1:], 0)
        # Inverse offset transform gives the predicted box coordinates
        # (offset_pred is what the network predicts)
        predicted_bb = offset_inverse(anchors, offset_pred)
        keep = nms(predicted_bb, conf, nms_threshold)
        # Find every index that was not kept and mark it as background
        all_idx = torch.arange(num_anchors, dtype=torch.long, device=device)
        combined = torch.cat((keep, all_idx))
        # Kept indices appear twice in combined, suppressed ones only once
        uniques, counts = combined.unique(return_counts=True)
        non_keep = uniques[counts == 1]
        all_id_sorted = torch.cat((keep, non_keep))
        # Suppressed boxes get class -1 (they overlap a kept box too much)
        class_id[non_keep] = -1
        # Reorder by all_id_sorted so the kept boxes come first
        class_id = class_id[all_id_sorted]
        conf, predicted_bb = conf[all_id_sorted], predicted_bb[all_id_sorted]
        # pos_threshold is the threshold for non-background predictions:
        # low-confidence boxes are removed (class id set to -1)
        below_min_idx = (conf < pos_threshold)
        class_id[below_min_idx] = -1
        conf[below_min_idx] = 1 - conf[below_min_idx]
        # Concatenate into one row per anchor; this image is done
        pred_info = torch.cat((class_id.unsqueeze(1), conf.unsqueeze(1), predicted_bb), dim=1)
        out.append(pred_info)
    return torch.stack(out)

测试函数，其中设置图像有3个类别，0为背景类别，第二为人类别，第三类为其他类别

# Four hand-written, heavily overlapping anchors in normalised coordinates
anchors = torch.tensor([[0.2120, 0.2450, 0.5100, 0.8550],  # 0.9
                            [0.1920, 0.2730, 0.4870, 0.8430],  # 0.8
                            [0.1720, 0.3040, 0.4540, 0.7490],  # 0.7
                            [0.2720, 0.3340, 0.4040, 0.6090]])  # 0.3
# All-zero predicted offsets: the predicted boxes equal the anchors
offset_preds = torch.tensor([0] * anchors.numel())
# One row per class (background first, then the two foreground classes), one column per anchor
cls_probs = torch.tensor([[0] * 4,                  # predicted probability of background
                          [0.9, 0.8, 0.7, 0.6],
                          [0.1, 0.2, 0.3, 0.4]])    # predicted probabilities of the foreground classes
# Add the batch dimension before calling
output = multibox_detection(cls_probs.unsqueeze(dim=0),
                            offset_preds.unsqueeze(dim=0),
                            anchors.unsqueeze(dim=0),
                            nms_threshold=0.5)
print(output, output.shape)

E:\anacanda\envs\project\lib\site-packages\ipykernel_launcher.py:5: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
  """
tensor([[[ 0.0000,  0.9000,  0.2120,  0.2450,  0.5100,  0.8550],
         [ 0.0000,  0.6000,  0.2720,  0.3340,  0.4040,  0.6090],
         [-1.0000,  0.8000,  0.1920,  0.2730,  0.4870,  0.8430],
         [-1.0000,  0.7000,  0.1720,  0.3040,  0.4540,  0.7490]]]) torch.Size([1, 4, 6])

可以看见输出结果可以除去重复度较高的iou预测边界框，并且输出类别信息。现在查看筛选出来的两个边界框，可以看见，相比之下去除了两个iou比较重复的边界框，使得结果更加清晰。

# Draw the detections that survived NMS on top of the image
fig = plt.imshow(image)
for i in output[0]:
    # Class -1 marks a suppressed or low-confidence box: skip it
    if i[0] == -1:
        continue
    # Class-name prefix followed by the confidence with two decimals
    label = ('dog=', 'cat=')[int(i[0])] + f'{float(i[1]):.2f}'
    # show_bboxes indexes labels per box, so wrap the single label in a list
    # (a bare string would be indexed character by character)
    show_bboxes(fig.axes, [i[2:] * bbox_scale], [label])
plt.show()

【22】目标检测中锚框（anchor）的相关总结

1. 目标检测和边界框

2. 锚框

3. 交并比

4. 将真实边界框分配给锚框

5. 使用非极大值抑制预测边界框

热门文章

最新文章

相关电子书

热门

活动广场

任务中心

开发者评测

高校计划

乘风者计划

训练营

阿里云MVP

话题

直播

下载

镜像站

技术资料

插件

【22】目标检测中锚框（anchor）的相关总结

1. 目标检测和边界框

2. 锚框

3. 交并比

4. 将真实边界框分配给锚框

5. 使用非极大值抑制预测边界框

热门文章

最新文章

相关电子书