The Object Detection Network
Detection backbone: VGG16
Defining VGG16
import torch
import torch.nn as nn

# Channel configuration: numbers are conv output channels,
# 'M' is a plain max-pool, 'C' is a max-pool with ceil_mode=True.
base = [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'C',
        512, 512, 512, 'M', 512, 512, 512]

def vgg(i):
    layers = []          # list holding the VGG modules
    in_channels = i      # the input is 300x300x3, so i = 3
    for v in base:       # walk through the channel configuration
        if v == 'M':     # pooling without padding the border
            layers += [nn.MaxPool2d(kernel_size=2, stride=2)]
        elif v == 'C':   # pooling that rounds the output size up (75 -> 38)
            layers += [nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True)]
        else:            # otherwise a 3x3 convolution; v is the output channel count
            conv2d = nn.Conv2d(in_channels, v, kernel_size=3, padding=1)
            # every convolution is followed by a ReLU activation
            layers += [conv2d, nn.ReLU(inplace=True)]
            in_channels = v
    pool5 = nn.MaxPool2d(kernel_size=3, stride=1, padding=1)
    # conv6 widens the channels to 1024 and replaces VGG's fully connected layer;
    # dilation=6 is the spacing between kernel elements: an atrous (dilated)
    # convolution that enlarges the receptive field without growing the kernel size
    conv6 = nn.Conv2d(512, 1024, kernel_size=3, padding=6, dilation=6)
    # conv7 outputs the 19x19 feature layer
    conv7 = nn.Conv2d(1024, 1024, kernel_size=1)
    layers += [pool5, conv6, nn.ReLU(inplace=True), conv7, nn.ReLU(inplace=True)]
    # layers now holds 35 modules: the 30 of the VGG body,
    # plus pool5, conv6 + ReLU and conv7 + ReLU
    return layers
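As a quick shape check, the minimal sketch below wraps the layer list in nn.Sequential (purely for this test) and feeds it a dummy 300x300 input; it should come out as a 19x19 map with 1024 channels:

import torch
import torch.nn as nn

# wrap the layer list in nn.Sequential purely for this shape check
net = nn.Sequential(*vgg(3))
x = torch.randn(1, 3, 300, 300)      # dummy 300x300 RGB image
with torch.no_grad():
    out = net(x)
print(out.shape)                     # torch.Size([1, 1024, 19, 19])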
Building the model: Mymodel
The model needs to subclass nn.Module.
Parameter description:
phase: indicates whether the model is in the training or test stage; when it is 'test', the Detect() function from the original SSD can be used directly.
num_classes: number of classes (following SSD, this should be your own class count + 1, to include the background class).
confidence: confidence threshold.
nms_iou: NMS IoU threshold.
Note: self.backbone = nn.ModuleList(vgg(3)) calls the VGG16 code defined above. The result must be wrapped in nn.ModuleList(), otherwise an error is raised later, because a plain Python list does not register the submodules with the model, as the sketch below shows.
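A minimal sketch of the difference (the class names WithList and WithModuleList are just for illustration): modules stored in a plain Python list are invisible to PyTorch, so their parameters never reach the optimizer or the state_dict:

import torch.nn as nn

class WithList(nn.Module):
    def __init__(self):
        super().__init__()
        self.layers = [nn.Conv2d(3, 8, 3)]                 # plain list: not registered

class WithModuleList(nn.Module):
    def __init__(self):
        super().__init__()
        self.layers = nn.ModuleList([nn.Conv2d(3, 8, 3)])  # properly registered

print(len(list(WithList().parameters())))        # 0 -- the optimizer would see nothing
print(len(list(WithModuleList().parameters())))  # 2 -- weight and bias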
The PriorBox code is given further below.
from torch.autograd import Variable

# Config, Detect and the PriorBox class (given below) come from the SSD code
class Mymodel(nn.Module):
    def __init__(self, phase, num_classes, confidence, nms_iou):
        super(Mymodel, self).__init__()
        self.phase = phase
        self.cfg = Config
        self.num_classes = num_classes
        self.backbone = nn.ModuleList(vgg(3))
        box = [4]   # number of anchors on the last feature layer
        self.priorbox = PriorBox(self.cfg)
        with torch.no_grad():
            self.priors = Variable(self.priorbox.forward())
        loc_layers = [nn.Conv2d(self.backbone[-2].out_channels, box[0] * 4,
                                kernel_size=3, padding=1)]
        conf_layers = [nn.Conv2d(self.backbone[-2].out_channels, box[0] * num_classes,
                                 kernel_size=3, padding=1)]
        self.loc = nn.ModuleList(loc_layers)
        self.conf = nn.ModuleList(conf_layers)
        if phase == 'test':
            self.softmax = nn.Softmax(dim=-1)  # class probabilities sum to 1
            # Detect(num_classes, bkg_label, top_k, conf_thresh, nms_thresh)
            # top_k: number of predicted boxes kept per class per image
            # conf_thresh: confidence threshold
            # nms_thresh: smaller values allow less overlap between kept boxes;
            #             0.0 forbids any overlap
            self.detect = Detect(num_classes, 0, 200, confidence, nms_iou)

    def forward(self, x):
        loc = list()       # localization regression results
        conf = list()      # classification results
        features = list()  # feature layers used for prediction
        for k in range(len(self.backbone)):
            x = self.backbone[k](x)
        features.append(x)  # last feature layer: (batch_size, 1024, 19, 19)
        for (x, l, c) in zip(features, self.loc, self.conf):
            loc.append(l(x).permute(0, 2, 3, 1).contiguous())
            conf.append(c(x).permute(0, 2, 3, 1).contiguous())
        loc = torch.cat([o.view(o.size(0), -1) for o in loc], 1)
        conf = torch.cat([o.view(o.size(0), -1) for o in conf], 1)
        if self.phase == "test":
            output = self.detect(
                loc.view(loc.size(0), -1, 4),
                self.softmax(conf.view(conf.size(0), -1, self.num_classes)),
                self.priors
            )
        else:  # training phase
            output = (
                loc.view(loc.size(0), -1, 4),
                conf.view(conf.size(0), -1, self.num_classes),
                self.priors
            )
        return output
Let's print the network:
Mymodel(
  (backbone): ModuleList(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace=True)
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU(inplace=True)
    (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): ReLU(inplace=True)
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU(inplace=True)
    (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): ReLU(inplace=True)
    (16): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=True)
    (17): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (18): ReLU(inplace=True)
    (19): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (20): ReLU(inplace=True)
    (21): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (22): ReLU(inplace=True)
    (23): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (24): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (25): ReLU(inplace=True)
    (26): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (27): ReLU(inplace=True)
    (28): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (29): ReLU(inplace=True)
    (30): MaxPool2d(kernel_size=3, stride=1, padding=1, dilation=1, ceil_mode=False)
    (31): Conv2d(512, 1024, kernel_size=(3, 3), stride=(1, 1), padding=(6, 6), dilation=(6, 6))
    (32): ReLU(inplace=True)
    (33): Conv2d(1024, 1024, kernel_size=(1, 1), stride=(1, 1))
    (34): ReLU(inplace=True)
  )
  (loc): ModuleList(
    (0): Conv2d(1024, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  )
  (conf): ModuleList(
    (0): Conv2d(1024, 8, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  )
)
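As a quick sanity check, the sketch below (assuming Config and the PriorBox class from the next section are in scope; Detect is only needed in test mode) runs a dummy batch through the model in training mode and verifies the output shapes:

import torch

model = Mymodel('train', num_classes=2, confidence=0.5, nms_iou=0.45)
x = torch.randn(2, 3, 300, 300)           # dummy batch of two 300x300 images
loc, conf, priors = model(x)
print(loc.shape)     # torch.Size([2, 1444, 4])  -- 19*19*4 priors, 4 offsets each
print(conf.shape)    # torch.Size([2, 1444, 2])  -- 2 classes incl. background
print(priors.shape)  # torch.Size([1444, 4])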
With that, the overall network is essentially built; the next step is to define the prior boxes.
Defining the prior boxes
The prior-box definition here follows the approach in SSD. You could instead use YOLO-style anchors (which first requires clustering to generate them), or go anchor-free like CenterNet (I tried that approach, and the detection results were not as good).
First, we generate a grid of point coordinates over the prediction feature layer. Since it is 19*19, each row and each column yields 19 points; drawing the grid over the feature layer shows this more clearly:
Grid-point coordinate matrices: the grid points are the intersections of the grid lines, each expressed as a coordinate. All the x coordinates are collected into one matrix X, and all the y coordinates into another matrix Y.
Given these coordinates, we can compute the center of every grid cell, as the sketch below shows.
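A minimal sketch of these two steps, using the same numbers as the PriorBox code below (19*19 feature map, step 16):

import numpy as np

f = 19                                  # feature-map size
x, y = np.meshgrid(np.arange(f), np.arange(f))
x, y = x.reshape(-1), y.reshape(-1)     # 361 grid points in each matrix

f_k = 300 / 16                          # image_size / steps[0] = 18.75
cx = (x + 0.5) / f_k                    # normalized center x of every cell
cy = (y + 0.5) / f_k
print(cx[:3])                           # [0.02666667 0.08       0.13333333]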
Next, we can compute the scale s_k of each cell's default boxes relative to the original image. In the code, self.min_sizes and self.max_sizes are the smallest and largest prior-box sizes; where these values come from is explained later. The small and large squares in the code are the default priors generated per cell, and aspect_ratios adds two more rectangular priors, for a total of 4 priors per cell. The sketch below makes the four shapes concrete.
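A small sketch computing the normalized (w, h) of the four default boxes for a single cell, using the conv7 numbers (min_size 60, max_size 111, aspect ratio 2):

from math import sqrt

image_size, min_size, max_size = 300, 60, 111
s_k = min_size / image_size                          # 0.2   -> small square
s_k_prime = sqrt(s_k * max_size / image_size)        # ~0.272 -> large square
boxes = [(s_k, s_k), (s_k_prime, s_k_prime)]
for ar in [2]:                                       # aspect_ratios[0]
    boxes += [(s_k * sqrt(ar), s_k / sqrt(ar)),      # wide rectangle
              (s_k / sqrt(ar), s_k * sqrt(ar))]      # tall rectangle
print(boxes)  # the four (w, h) pairs attached to every grid cell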
The parameter self.steps=[16] is a stride describing how the feature map maps back to the original image: the feature layer is 19*19 and the original image is 300*300, and 19*16 ≈ 300.
import numpy as np
import torch
from math import sqrt

class PriorBox(object):
    def __init__(self, cfg):
        super(PriorBox, self).__init__()
        # input image size, 300x300 by default
        self.image_size = 300
        self.num_priors = 1
        self.variance = [0.1]
        self.feature_maps = [19]    # size of the feature layer output by conv7
        self.min_sizes = [60]       # smallest prior-box size
        self.max_sizes = [111]      # largest prior-box size
        self.steps = [16]           # mapping between the feature layer and the input image
        self.aspect_ratios = [[2]]  # aspect ratios
        self.clip = cfg['clip']
        for v in self.variance:
            if v <= 0:
                raise ValueError('Variances must be greater than 0')

    def forward(self):
        mean = []
        #----------------------------------------#
        #   one 19x19 feature layer is used for prediction
        #----------------------------------------#
        for k, f in enumerate(self.feature_maps):
            #----------------------------------------#
            #   build the grid-point coordinate matrices
            #----------------------------------------#
            x, y = np.meshgrid(np.arange(f), np.arange(f))
            x = x.reshape(-1)
            y = y.reshape(-1)
            #----------------------------------------#
            #   all priors are normalized to [0, 1]
            #----------------------------------------#
            for i, j in zip(y, x):
                f_k = self.image_size / self.steps[k]
                #----------------------------------------#
                #   center of the grid cell
                #----------------------------------------#
                cx = (j + 0.5) / f_k
                cy = (i + 0.5) / f_k
                #----------------------------------------#
                #   small square
                #----------------------------------------#
                s_k = self.min_sizes[k] / self.image_size
                mean += [cx, cy, s_k, s_k]
                #----------------------------------------#
                #   large square
                #----------------------------------------#
                s_k_prime = sqrt(s_k * (self.max_sizes[k] / self.image_size))
                mean += [cx, cy, s_k_prime, s_k_prime]
                #----------------------------------------#
                #   the two rectangles
                #----------------------------------------#
                for ar in self.aspect_ratios[k]:
                    mean += [cx, cy, s_k * sqrt(ar), s_k / sqrt(ar)]
                    mean += [cx, cy, s_k / sqrt(ar), s_k * sqrt(ar)]
        output = torch.Tensor(mean).view(-1, 4)  # 19*19*4 = 1444 priors
        if self.clip:
            output.clamp_(max=1, min=0)
        return output
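A quick check of the output (here cfg is a minimal stand-in for Config, which this class only reads a 'clip' key from):

cfg = {'clip': True}                 # minimal stand-in for Config
priors = PriorBox(cfg).forward()
print(priors.shape)                  # torch.Size([1444, 4])
print(priors.min().item(), priors.max().item())  # everything lies in [0, 1]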
Now, where do min_sizes=[60] and max_sizes=[111] come from?
In the SSD paper the scale ratio ranges over [0.2, 0.9], but the reference implementation scales this up by a factor of 100, giving the range [20, 90].
So for the conv7 feature layer: min_sizes = min_dim * ratio / 100 = 300 * 20 / 100 = 60, and max_sizes = min_dim * (ratio + step) / 100 = 300 * (20 + 17) / 100 = 111. Note that this step is not the same thing as steps above; it comes from this formula in the SSD paper:
s_k = s_min + (s_max - s_min) / (m - 1) * (k - 1),  k ∈ [1, m],  with s_min = 0.2, s_max = 0.9
After the ×100 scaling, (s_max - s_min)/(m - 1) works out to 17, meaning a scale is taken every 17 steps across [20, 90]: 20, 37, 54, 71, 88. Each scale is divided by 100 and multiplied by the original image size. So equivalently: for conv7, s_k is 20 and min_sizes = 20/100 * 300 = 60. The short script below reproduces these numbers.
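A small script reproducing the scale ladder (here m = 5 is assumed, which gives floor((90 - 20)/(5 - 1)) = 17 and matches the scales listed above):

import math

min_dim = 300
min_ratio, max_ratio = 20, 90       # [0.2, 0.9] scaled by 100
m = 5                               # assumed number of feature layers using the formula
step = int(math.floor((max_ratio - min_ratio) / (m - 1)))   # 17

min_sizes, max_sizes = [], []
for ratio in range(min_ratio, max_ratio + 1, step):         # 20, 37, 54, 71, 88
    min_sizes.append(min_dim * ratio / 100.0)
    max_sizes.append(min_dim * (ratio + step) / 100.0)
print(min_sizes[0], max_sizes[0])   # 60.0 111.0 -- the conv7 values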
Loss function
For the loss I directly use the one from SSD, so I won't go into much detail here; see the original paper.
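For reference, the overall objective from the SSD paper is a weighted sum of the two terms computed below:

L(x, c, l, g) = (1/N) * (L_conf(x, c) + α * L_loc(x, l, g))

where N is the number of matched default boxes (the loss is set to 0 when N = 0), L_conf is the softmax loss over class confidences, L_loc is the Smooth L1 loss between the predicted and encoded ground-truth box offsets, and the weight α is set to 1 in the paper.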
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

# match() and log_sum_exp() come from the SSD box-utility code
class MultiBoxLoss(nn.Module):
    def __init__(self, num_classes, overlap_thresh, prior_for_matching,
                 bkg_label, neg_mining, neg_pos, neg_overlap,
                 encode_target, use_gpu=True, negatives_for_hard=100.0):
        super(MultiBoxLoss, self).__init__()
        self.use_gpu = use_gpu
        self.num_classes = num_classes
        self.threshold = overlap_thresh
        self.background_label = bkg_label
        self.encode_target = encode_target
        self.use_prior_for_matching = prior_for_matching
        self.do_neg_mining = neg_mining
        self.negpos_ratio = neg_pos
        self.neg_overlap = neg_overlap
        self.negatives_for_hard = negatives_for_hard
        self.variance = Config['variance']

    def forward(self, predictions, targets):
        #--------------------------------------------------#
        #   unpack the three prediction outputs:
        #   localization, confidences, priors
        #--------------------------------------------------#
        loc_data, conf_data, priors = predictions
        #--------------------------------------------------#
        #   batch size and number of priors
        #--------------------------------------------------#
        num = loc_data.size(0)
        num_priors = priors.size(0)
        #--------------------------------------------------#
        #   tensors that will hold the matching results
        #--------------------------------------------------#
        loc_t = torch.zeros(num, num_priors, 4).type(torch.FloatTensor)
        conf_t = torch.zeros(num, num_priors).long()
        if self.use_gpu:
            loc_t = loc_t.cuda()
            conf_t = conf_t.cuda()
            priors = priors.cuda()
        for idx in range(num):
            # ground truth and labels: targets[x1, y1, x2, y2, label]
            truths = targets[idx][:, :-1]  # the (x1, y1, x2, y2) coordinates
            labels = targets[idx][:, -1]   # the label
            if len(truths) == 0:
                continue
            defaults = priors
            #--------------------------------------------------#
            #   match ground-truth boxes against the priors:
            #   a prior with high overlap is considered matched
            #   and becomes responsible for that ground truth
            #--------------------------------------------------#
            match(self.threshold, truths, defaults, self.variance,
                  labels, loc_t, conf_t, idx)
        #--------------------------------------------------#
        #   loc_t  (num, num_priors, 4)
        #   conf_t (num, num_priors)
        #--------------------------------------------------#
        loc_t = Variable(loc_t, requires_grad=False)
        conf_t = Variable(conf_t, requires_grad=False)
        # wherever conf_t > 0, the prior contains an object
        pos = conf_t > 0
        #--------------------------------------------------#
        #   number of positive samples in each image
        #   num_pos (num, )
        #--------------------------------------------------#
        num_pos = pos.sum(dim=1, keepdim=True)
        #--------------------------------------------------#
        #   gather all positives and compute the localization loss
        #   pos_idx (num, num_priors, 4)
        #--------------------------------------------------#
        pos_idx = pos.unsqueeze(pos.dim()).expand_as(loc_data)
        loc_p = loc_data[pos_idx].view(-1, 4)
        loc_t = loc_t[pos_idx].view(-1, 4)
        loss_l = F.smooth_l1_loss(loc_p, loc_t, reduction='sum')
        #--------------------------------------------------#
        #   batch_conf (num * num_priors, num_classes)
        #   loss_c     (num, num_priors)
        #--------------------------------------------------#
        batch_conf = conf_data.view(-1, self.num_classes)
        # find the hard-to-classify priors
        loss_c = log_sum_exp(batch_conf) - batch_conf.gather(1, conf_t.view(-1, 1))
        loss_c = loss_c.view(num, -1)
        # hard negative mining excludes the positives and keeps only
        # the hard negatives
        loss_c[pos] = 0
        #--------------------------------------------------#
        #   loss_idx (num, num_priors)
        #   idx_rank (num, num_priors)
        #--------------------------------------------------#
        _, loss_idx = loss_c.sort(1, descending=True)
        _, idx_rank = loss_idx.sort(1)
        #--------------------------------------------------#
        #   number of positives per image
        #   num_pos (num, )
        #   neg     (num, num_priors)
        #--------------------------------------------------#
        num_pos = pos.long().sum(1, keepdim=True)
        # cap the number of negatives
        num_neg = torch.clamp(self.negpos_ratio * num_pos, max=pos.size(1) - 1)
        num_neg[num_neg.eq(0)] = self.negatives_for_hard
        neg = idx_rank < num_neg.expand_as(idx_rank)
        #--------------------------------------------------#
        #   pos_idx (num, num_priors, num_classes)
        #   neg_idx (num, num_priors, num_classes)
        #--------------------------------------------------#
        pos_idx = pos.unsqueeze(2).expand_as(conf_data)
        neg_idx = neg.unsqueeze(2).expand_as(conf_data)
        # select the positives and mined negatives used for training
        # and compute the confidence loss
        conf_p = conf_data[(pos_idx | neg_idx)].view(-1, self.num_classes)
        targets_weighted = conf_t[(pos | neg)]
        loss_c = F.cross_entropy(conf_p, targets_weighted, reduction='sum')
        N = torch.max(num_pos.data.sum(), torch.ones_like(num_pos.data.sum()))
        loss_l /= N
        loss_c /= N
        return loss_l, loss_c
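A sketch of how the loss plugs into a training step, under the assumption that match() and log_sum_exp() from the SSD utilities and a Config with a 'variance' key are available (the hyperparameter values here are illustrative):

import torch

criterion = MultiBoxLoss(num_classes=2, overlap_thresh=0.5,
                         prior_for_matching=True, bkg_label=0,
                         neg_mining=True, neg_pos=3, neg_overlap=0.5,
                         encode_target=False, use_gpu=False)
model = Mymodel('train', num_classes=2, confidence=0.5, nms_iou=0.45)

images = torch.randn(2, 3, 300, 300)
# one ground-truth box per image: [x1, y1, x2, y2, label] in normalized coords
targets = [torch.tensor([[0.3, 0.3, 0.7, 0.7, 1.0]]) for _ in range(2)]

loss_l, loss_c = criterion(model(images), targets)
(loss_l + loss_c).backward()
print(loss_l.item(), loss_c.item())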