# TensorFlow 2 和 Keras 高级深度学习：11~13（1）

## 十一、对象检测

• 对象检测的概念
• 多尺度目标检测的概念
• SSD 作为多尺度目标检测算法
• tf.keras中 SSD 的实现

## 1. 对象检测

• y_cls或单热向量形式的类别或类
• y_box = ((x_min, y_min), (x_max, y_max))或像素坐标形式的边界框坐标

## 2. 锚框

10 x 8格的锚框，每个框的尺寸为(w/10, h/8)

20 x 15格的锚框，每个锚框的尺寸为(w/20, h/15)

40 x 30格的锚框，每个框的尺寸为(w/40, h/30)

(s[xj], s[yj])是“公式 11.2.1”中的第j个比例因子。

“列表 11.2.1”：layer_utils.py中的锚框生成函数：

def anchor_boxes(feature_shape,
                 image_shape,
                 index=0,
                 n_layers=4,
                 aspect_ratios=(1, 2, 0.5)):
    """Compute the anchor boxes for a given feature map.

    Anchor boxes are in minmax format (xmin, xmax, ymin, ymax),
    expressed in the same units as image_shape.

    Arguments:
        feature_shape (list): Feature map shape; unpacked as
            (height, width, n_feature_maps)
        image_shape (list): Image shape; unpacked as
            (height, width, channels)
        index (int): Indicates which of ssd head layers
            are we referring to
        n_layers (int): Number of ssd head layers
        aspect_ratios (tuple): Aspect ratios of the anchor boxes;
            one box is generated per ratio, plus one extra box
            of a second size

    Returns:
        boxes (tensor): Anchor boxes per feature map, shape
            (1, feature_height, feature_width, n_boxes, 4)
            where n_boxes = len(aspect_ratios) + 1
    """
    # anchor box sizes given an index of layer in ssd head
    sizes = anchor_sizes(n_layers)[index]
    # number of anchor boxes per feature map pt
    n_boxes = len(aspect_ratios) + 1
    # ignore number of channels (last)
    image_height, image_width, _ = image_shape
    # ignore number of feature maps (last)
    feature_height, feature_width, _ = feature_shape

    # normalized width and height
    # sizes[0] is scale size, sizes[1] is sqrt(scale*(scale+1))
    norm_height = image_height * sizes[0]
    norm_width = image_width * sizes[0]

    # list of anchor boxes (width, height)
    width_height = []
    # anchor box by aspect ratio on resized image dims
    # Equation 11.2.3
    for ar in aspect_ratios:
        box_width = norm_width * np.sqrt(ar)
        box_height = norm_height / np.sqrt(ar)
        width_height.append((box_width, box_height))
    # multiply anchor box dim by size[1] for aspect_ratio = 1
    # Equation 11.2.4
    box_width = image_width * sizes[1]
    box_height = image_height * sizes[1]
    width_height.append((box_width, box_height))

    # now an array of (width, height)
    width_height = np.array(width_height)

    # dimensions of each receptive field in pixels
    grid_width = image_width / feature_width
    grid_height = image_height / feature_height

    # compute center of receptive field per feature pt
    # (cx, cy) format
    # starting at midpoint of 1st receptive field
    start = grid_width * 0.5
    # ending at midpoint of last receptive field
    end = (feature_width - 0.5) * grid_width
    cx = np.linspace(start, end, feature_width)

    start = grid_height * 0.5
    end = (feature_height - 0.5) * grid_height
    cy = np.linspace(start, end, feature_height)

    # grid of box centers
    cx_grid, cy_grid = np.meshgrid(cx, cy)

    # add a trailing axis so np.tile() can repeat per anchor box
    cx_grid = np.expand_dims(cx_grid, -1)
    cy_grid = np.expand_dims(cy_grid, -1)

    # tensor = (feature_map_height, feature_map_width, n_boxes, 4)
    # aligned with image tensor (height, width, channels)
    # last dimension = (cx, cy, w, h)
    boxes = np.zeros((feature_height, feature_width, n_boxes, 4))

    # (cx, cy)
    boxes[..., 0] = np.tile(cx_grid, (1, 1, n_boxes))
    boxes[..., 1] = np.tile(cy_grid, (1, 1, n_boxes))

    # (w, h): the same set of n_boxes sizes at every grid point
    boxes[..., 2] = width_height[:, 0]
    boxes[..., 3] = width_height[:, 1]

    # convert (cx, cy, w, h) to (xmin, xmax, ymin, ymax)
    boxes = centroid2minmax(boxes)
    # prepend one dimension to boxes
    # to account for the batch size = 1
    boxes = np.expand_dims(boxes, axis=0)
    return boxes
def centroid2minmax(boxes):
    """Centroid to minmax format
    (cx, cy, w, h) to (xmin, xmax, ymin, ymax)

    Arguments:
        boxes (tensor): Batch of boxes in centroid format; the last
            axis holds (cx, cy, w, h) in its first four entries

    Returns:
        minmax (tensor): Batch of boxes in minmax format, as a new
            float array (the input is not modified); entries past
            the first four on the last axis are copied unchanged
    """
    # builtin float instead of the np.float alias, which NumPy removed
    boxes = np.asarray(boxes)
    minmax = np.copy(boxes).astype(float)
    half_width = 0.5 * boxes[..., 2]
    half_height = 0.5 * boxes[..., 3]
    minmax[..., 0] = boxes[..., 0] - half_width
    minmax[..., 1] = boxes[..., 0] + half_width
    minmax[..., 2] = boxes[..., 1] - half_height
    minmax[..., 3] = boxes[..., 1] + half_height
    return minmax

## 3. 真实情况锚框

| | B[0] |
| --- | --- |
| A[0] | 0 |
| A[1] | 0.32 |
| A[2] | 0 |
| A[3] | 0 |
| A[4] | 0.30 |
| A[5] | 0 |

“表 11.3.1”：每个锚框A[j]（j ∈ 0 .. 5）与对象边界框B[0]的 IoU，如“图 11.3.1”所示。

“列表 11.3.1”显示了get_gt_data()的实现，该实现计算锚定框的真实情况标签。

“列表 11.3.1”：layer_utils.py

def get_gt_data(iou,
                n_classes=4,
                anchors=None,
                labels=None,
                normalize=False,
                threshold=0.6):
    """Retrieve ground truth class, bbox offset, and mask

    Arguments:
        iou (tensor): IoU of each bounding box wrt each anchor box;
            rows index anchor boxes, columns index bounding boxes
        n_classes (int): Number of object classes
            (index 0 is the background)
        anchors (tensor): Anchor boxes per feature layer, one row of
            (xmin, xmax, ymin, ymax) per anchor box
        labels (tensor): Ground truth labels, one row per bounding
            box; columns 0..3 are the box coordinates in
            (xmin, xmax, ymin, ymax) format, column 4 is the class
        normalize (bool): If normalization should be applied
        threshold (float): If less than 1.0, anchor boxes>threshold
            are also part of positive anchor boxes

    Returns:
        gt_class, gt_offset, gt_mask (tensor): Ground truth classes,
            offsets, and masks, each with one row per anchor box
    """
    # each maxiou_per_gt is index of anchor w/ max iou
    # for the given ground truth bounding box
    maxiou_per_gt = np.argmax(iou, axis=0)

    # get extra anchor boxes based on IoU
    if threshold < 1.0:
        iou_gt_thresh = np.argwhere(iou > threshold)
        if iou_gt_thresh.size > 0:
            # column 0: anchor index, column 1: bounding box index
            extra_anchors = iou_gt_thresh[:, 0]
            extra_boxes = iou_gt_thresh[:, 1]
            extra_labels = labels[extra_boxes]
            maxiou_per_gt = np.concatenate([maxiou_per_gt, extra_anchors],
                                           axis=0)
            labels = np.concatenate([labels, extra_labels],
                                    axis=0)

    # mask generation
    # the mask was used below without ever being created (NameError)
    # NOTE(review): shape chosen to match gt_offset so the mask can be
    # applied element-wise to the offsets -- confirm against consumer
    gt_mask = np.zeros((iou.shape[0], 4))
    # only indexes maxiou_per_gt are valid bounding boxes
    gt_mask[maxiou_per_gt] = 1.0

    # class generation
    gt_class = np.zeros((iou.shape[0], n_classes))
    # by default all are background (index 0)
    gt_class[:, 0] = 1
    # but those that belong to maxiou_per_gt are not
    gt_class[maxiou_per_gt, 0] = 0
    # the label of object in maxiou_per_gt:
    # row = positive anchor index, column = its object class
    label_col = labels[:, 4].astype(int)
    gt_class[maxiou_per_gt, label_col] = 1.0

    # offsets generation
    gt_offset = np.zeros((iou.shape[0], 4))

    # (cx, cy, w, h) format
    if normalize:
        anchors = minmax2centroid(anchors)
        labels = minmax2centroid(labels)
        # bbox = bounding box
        # ((bbox xcenter - anchor box xcenter)/anchor box width)/.1
        # ((bbox ycenter - anchor box ycenter)/anchor box height)/.1
        # Equation 11.4.8 Chapter 11
        offsets1 = labels[:, 0:2] - anchors[maxiou_per_gt, 0:2]
        offsets1 /= anchors[maxiou_per_gt, 2:4]
        offsets1 /= 0.1

        # log(bbox width / anchor box width) / 0.2
        # log(bbox height / anchor box height) / 0.2
        # Equation 11.4.8 Chapter 11
        offsets2 = np.log(labels[:, 2:4] / anchors[maxiou_per_gt, 2:4])
        offsets2 /= 0.2

        offsets = np.concatenate([offsets1, offsets2], axis=-1)
    # (xmin, xmax, ymin, ymax) format
    else:
        offsets = labels[:, 0:4] - anchors[maxiou_per_gt]

    gt_offset[maxiou_per_gt] = offsets

    return gt_class, gt_offset, gt_mask
def minmax2centroid(boxes):
    """Minmax to centroid format
    (xmin, xmax, ymin, ymax) to (cx, cy, w, h)

    Arguments:
        boxes (tensor): Batch of boxes in minmax format; the last
            axis holds (xmin, xmax, ymin, ymax) in its first four
            entries

    Returns:
        centroid (tensor): Batch of boxes in centroid format, as a
            new float array (the input is not modified); entries past
            the first four on the last axis are copied unchanged
    """
    # builtin float instead of the np.float alias, which NumPy removed
    boxes = np.asarray(boxes)
    centroid = np.copy(boxes).astype(float)
    width = boxes[..., 1] - boxes[..., 0]
    height = boxes[..., 3] - boxes[..., 2]
    # center = min edge + half of the extent
    centroid[..., 0] = 0.5 * width
    centroid[..., 0] += boxes[..., 0]
    centroid[..., 1] = 0.5 * height
    centroid[..., 1] += boxes[..., 2]
    centroid[..., 2] = width
    centroid[..., 3] = height
    return centroid

maxiou_per_gt = np.argmax(iou, axis=0)实现了“公式 11.3.2”。 额外的正锚框是根据用户定义的阈值确定的，由iou_gt_thresh = np.argwhere(iou>threshold)实现。

gt_mask[maxiou_per_gt] = 1.0

# class generation: one row per anchor box, one column per class
gt_class = np.zeros((iou.shape[0], n_classes))
# by default all are background (index 0)
gt_class[:, 0] = 1

# but those that belong to maxiou_per_gt are not
gt_class[maxiou_per_gt, 0] = 0
# we have to find those column indexes (classes)
# reshape both index vectors into columns so they can be paired
maxiou_col = np.reshape(maxiou_per_gt,
(maxiou_per_gt.shape[0], 1))
label_col = np.reshape(labels[:,4],
(labels.shape[0], 1)).astype(int)
# each row of row_col is (anchor index, class index)
row_col = np.append(maxiou_col, label_col, axis=1)
# set the label of the object for each anchor in maxiou_per_gt
gt_class[row_col[:,0], row_col[:,1]]  = 1.0

row_col[:,0]是正锚框的索引，而row_col[:,1]是它们的非背景对象类的索引。 请注意，gt_class是单热向量的数组。 除了锚框所对应对象类别的索引处为 1 之外，其余值均为零。 索引 0 是背景，索引 1 是第一个非背景对象，依此类推。 最后一个非背景对象的索引等于n_classes-1。

gt_class[0] = [1.0, 0.0, 0.0, 0.0]

gt_class[1] = [0.0, 0.0, 1.0, 0.0]

# (xmin, xmax, ymin, ymax) format
else:
offsets = labels[:, 0:4] - anchors[maxiou_per_gt]

# (cx, cy, w, h) format
if normalize:
    anchors = minmax2centroid(anchors)
    labels = minmax2centroid(labels)
    # bbox = bounding box
    # ((bbox xcenter - anchor box xcenter)/anchor box width)/.1
    # ((bbox ycenter - anchor box ycenter)/anchor box height)/.1
    # Equation 11.4.8
    offsets1 = labels[:, 0:2] - anchors[maxiou_per_gt, 0:2]
    offsets1 /= anchors[maxiou_per_gt, 2:4]
    offsets1 /= 0.1
    # log(bbox width / anchor box width) / 0.2
    # log(bbox height / anchor box height) / 0.2
    # Equation 11.4.8
    offsets2 = np.log(labels[:, 2:4]/anchors[maxiou_per_gt, 2:4])
    offsets2 /= 0.2
    # one row per positive anchor: (cx, cy) offsets then (w, h) offsets
    offsets = np.concatenate([offsets1, offsets2], axis=-1)

## 4. 损失函数

• L_cls - y_cls的分类交叉熵损失
• L_off - L1 或 L2，用于y_off。 请注意，只有正锚框对L_off有贡献。 L1 也称为平均绝对误差（MAE）损失，而 L2 也称为均方误差（MSE）损失。

• y_cls或单热向量形式的类别或类
• y_off = ((x_omin, y_omin), (x_omax, y_omax))或相对于锚框的像素坐标形式的偏移。

y_off = ((x_omin, y_omin), (x_omax, y_omax)) (Equation 11.4.2)

SSD 是一种监督对象检测算法。 可以使用以下基本真值：

• y_label或要检测的每个对象的类标签
• y_gt = (x_gmin, x_gmax, y_gmin, y_gmax)或真实情况偏移，其计算公式如下：

y_gt = (x_bmin - x_amin, x_bmax - x_amax, y_bmin - y_amin, y_bmax - y_amax) (Equation 11.4.3)

(Equation 11.4.4)

(w[b], h[b]) = (x_max - x_min, y_max - y_min) (Equation 11.4.6)

“列表 11.4.1”：loss.py L1 和平滑 L1 损失函数

from tensorflow.keras.losses import Huber
"""Pre-process ground truth and prediction data"""
def extract_offsets(y_true, y_pred):
    """Slice the offset entries out of ground truth and prediction

    NOTE(review): the `def` line of this helper is missing from the
    listing, so its name is reconstructed here -- confirm against the
    original loss.py. No masking of non-positive anchor boxes is
    applied here because none is shown in the listing; verify whether
    the original helper also applies a mask.

    Arguments:
        y_true (tensor): Ground truth; the first 4 entries of the
            last axis are the offsets
        y_pred (tensor): Prediction; the first 4 entries of the
            last axis are used as the predicted offsets

    Returns:
        offset, pred (tensor): Ground truth and predicted offsets
    """
    # 1st 4 are offsets
    offset = y_true[..., 0:4]
    # pred is actually duplicated for alignment
    # either we get the 1st or last 4 offset pred
    pred = y_pred[..., 0:4]
    return offset, pred


def l1_loss(y_true, y_pred):
    """MAE or L1 loss

    Arguments:
        y_true (tensor): Ground truth offsets
        y_pred (tensor): Predicted offsets

    Returns:
        loss (tensor): Mean absolute error over the last axis
    """
    # offset and pred were used without ever being assigned
    offset, pred = extract_offsets(y_true, y_pred)
    # we can use L1
    return K.mean(K.abs(pred - offset), axis=-1)


def smooth_l1_loss(y_true, y_pred):
    """Smooth L1 loss using tensorflow Huber loss

    Arguments:
        y_true (tensor): Ground truth offsets
        y_pred (tensor): Predicted offsets

    Returns:
        loss (tensor): Huber loss between the offsets
    """
    # offset and pred were used without ever being assigned
    offset, pred = extract_offsets(y_true, y_pred)
    # Huber loss as approx of smooth L1
    return Huber()(offset, pred)

“列表 11.4.2”：loss.py焦点损失

def focal_loss_categorical(y_true, y_pred):
    """Categorical cross-entropy focal loss

    Arguments:
        y_true (tensor): Ground truth classes
            # assumes one-hot vectors on the last axis -- TODO confirm
        y_pred (tensor): Predicted class scores on the last axis

    Returns:
        loss (tensor): Focal loss summed over the last axis
    """
    gamma = 2.0
    alpha = 0.25

    # scale to ensure sum of prob is 1.0
    # (rebinding instead of /= so the caller's y_pred is never
    # modified in place)
    y_pred = y_pred / K.sum(y_pred, axis=-1, keepdims=True)

    # clip the prediction value to prevent NaN and Inf
    epsilon = K.epsilon()
    y_pred = K.clip(y_pred, epsilon, 1.0 - epsilon)

    # calculate cross entropy
    cross_entropy = -y_true * K.log(y_pred)

    # calculate focal loss: down-weight entries whose predicted
    # probability is already close to 1
    weight = alpha * K.pow(1 - y_pred, gamma)
    cross_entropy = cross_entropy * weight

    return K.sum(cross_entropy, axis=-1)

