2.2、BiFPN模块
如下图所示,BiFPN在图e的基础上增加了shortcut,这些都是在现有的一些工作的基础上添砖加瓦。
图 BiFPN与其他的特征融合方法的比较
但是,以往的特征融合方法对所有输入特征一视同仁,在BiFPN中则引入了加权策略,下边介绍本文提出来的加权策略(类似attention机制)。
最直白的思想是给每一个输入特征加上一个可学习的权重,如下:$O = \sum_i w_i \cdot I_i$
其中wi可以是一个标量(对每一个特征),可以是一个向量(对每一个通道),也可以是一个多维度的tensor(对每一个像素)。
但是如果不对wi加以限制,容易导致训练不稳定,于是很自然地想到对每一个权重用softmax:$O = \sum_i \frac{e^{w_i}}{\sum_j e^{w_j}} \cdot I_i$
但是计算softmax速度较慢,于是作者提出了快速的限制方法(fast normalized fusion):$O = \sum_i \frac{w_i}{\epsilon + \sum_j w_j} \cdot I_i$
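为了保证weight大于等于0,weight前采用relu函数。以上图BiFPN结构中第6层为例:$P_6^{td} = Conv\left(\frac{w_1 \cdot P_6^{in} + w_2 \cdot Resize(P_7^{in})}{w_1 + w_2 + \epsilon}\right)$,$P_6^{out} = Conv\left(\frac{w_1' \cdot P_6^{in} + w_2' \cdot P_6^{td} + w_3' \cdot Resize(P_5^{out})}{w_1' + w_2' + w_3' + \epsilon}\right)$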
PyTorch实现BiFPN模型:
import torch
import torch.nn as nn
import torch.nn.functional as F

from .module import ConvModule, xavier_init


class BIFPN(nn.Module):
    """Feature-pyramid neck made of lateral 1x1 convs plus stacked BiFPN layers.

    Each selected backbone level is projected to ``out_channels`` by a 1x1
    ``ConvModule``; the projected maps are then passed through ``stack``
    ``BiFPNModule`` layers in sequence. If ``num_outs`` exceeds the number of
    backbone levels used, extra coarser levels are appended (max-pooling or
    stride-2 convs, depending on ``add_extra_convs``).

    Args:
        in_channels: list with the channel count of every backbone feature
            map passed to ``forward``.
        out_channels: channel count of every output feature map.
        num_outs: number of feature maps returned by ``forward``.
        start_level: index of the first backbone level that is used.
        end_level: index one past the last backbone level that is used;
            ``-1`` means "use all levels from ``start_level`` on".
        stack: number of ``BiFPNModule`` layers applied one after another.
        add_extra_convs: build extra levels with stride-2 3x3 convs instead
            of max-pooling.
        extra_convs_on_inputs: when extra convs are used, feed the first one
            from the raw backbone feature instead of the last BiFPN output.
        relu_before_extra_convs: apply ReLU before the 2nd and later extra
            convs.
        no_norm_on_lateral: do not pass ``norm_cfg`` to the lateral convs.
        conv_cfg, norm_cfg, activation: forwarded to ``ConvModule``.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 num_outs,
                 start_level=0,
                 end_level=-1,
                 stack=1,
                 add_extra_convs=False,
                 extra_convs_on_inputs=True,
                 relu_before_extra_convs=False,
                 no_norm_on_lateral=False,
                 conv_cfg=None,
                 norm_cfg=None,
                 activation=None):
        super(BIFPN, self).__init__()
        assert isinstance(in_channels, list)
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.num_ins = len(in_channels)
        self.num_outs = num_outs
        self.activation = activation
        self.relu_before_extra_convs = relu_before_extra_convs
        self.no_norm_on_lateral = no_norm_on_lateral
        self.stack = stack

        if end_level == -1:
            self.backbone_end_level = self.num_ins
            assert num_outs >= self.num_ins - start_level
        else:
            # if end_level < inputs, no extra level is allowed
            self.backbone_end_level = end_level
            assert end_level <= len(in_channels)
            assert num_outs == end_level - start_level
        self.start_level = start_level
        self.end_level = end_level
        self.add_extra_convs = add_extra_convs
        self.extra_convs_on_inputs = extra_convs_on_inputs

        self.lateral_convs = nn.ModuleList()
        self.fpn_convs = nn.ModuleList()
        self.stack_bifpn_convs = nn.ModuleList()

        # One 1x1 lateral conv per used backbone level.
        lateral_norm_cfg = None if self.no_norm_on_lateral else norm_cfg
        for level in range(self.start_level, self.backbone_end_level):
            self.lateral_convs.append(
                ConvModule(
                    in_channels[level],
                    out_channels,
                    1,
                    conv_cfg=conv_cfg,
                    norm_cfg=lateral_norm_cfg,
                    activation=self.activation,
                    inplace=False))

        # `stack` BiFPN layers, applied sequentially in forward().
        num_levels = self.backbone_end_level - self.start_level
        for _ in range(stack):
            self.stack_bifpn_convs.append(
                BiFPNModule(
                    channels=out_channels,
                    levels=num_levels,
                    conv_cfg=conv_cfg,
                    norm_cfg=norm_cfg,
                    activation=activation))

        # add extra conv layers (e.g., RetinaNet)
        extra_levels = num_outs - self.backbone_end_level + self.start_level
        if add_extra_convs and extra_levels >= 1:
            for i in range(extra_levels):
                # A separate local name is used so the `in_channels`
                # argument is not shadowed.
                if i == 0 and self.extra_convs_on_inputs:
                    extra_in_channels = self.in_channels[
                        self.backbone_end_level - 1]
                else:
                    extra_in_channels = out_channels
                self.fpn_convs.append(
                    ConvModule(
                        extra_in_channels,
                        out_channels,
                        3,
                        stride=2,
                        padding=1,
                        conv_cfg=conv_cfg,
                        norm_cfg=norm_cfg,
                        activation=self.activation,
                        inplace=False))
        self.init_weights()

    # default init_weights for conv(msra) and norm in ConvModule
    def init_weights(self):
        """Apply ``xavier_init`` (uniform) to every ``nn.Conv2d`` submodule."""
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                xavier_init(m, distribution='uniform')

    def forward(self, inputs):
        """Fuse backbone features and return ``num_outs`` feature maps.

        Args:
            inputs: sequence of backbone feature maps, one per entry of
                ``in_channels``.

        Returns:
            Tuple of ``num_outs`` feature maps.
        """
        assert len(inputs) == len(self.in_channels)

        # build laterals
        laterals = [
            lateral_conv(inputs[i + self.start_level])
            for i, lateral_conv in enumerate(self.lateral_convs)
        ]

        # part 1: build top-down and down-top path with stack
        used_backbone_levels = len(laterals)
        for bifpn_module in self.stack_bifpn_convs:
            laterals = bifpn_module(laterals)
        outs = laterals

        # part 2: add extra levels
        if self.num_outs > len(outs):
            num_extra = self.num_outs - used_backbone_levels
            if not self.add_extra_convs:
                # use max pool to get more levels on top of outputs
                # (e.g., Faster R-CNN, Mask R-CNN)
                for _ in range(num_extra):
                    outs.append(F.max_pool2d(outs[-1], 1, stride=2))
            else:
                # add conv layers on top of original feature maps (RetinaNet)
                if self.extra_convs_on_inputs:
                    orig = inputs[self.backbone_end_level - 1]
                    outs.append(self.fpn_convs[0](orig))
                else:
                    outs.append(self.fpn_convs[0](outs[-1]))
                for i in range(1, num_extra):
                    if self.relu_before_extra_convs:
                        outs.append(self.fpn_convs[i](F.relu(outs[-1])))
                    else:
                        outs.append(self.fpn_convs[i](outs[-1]))
        return tuple(outs)


class BiFPNModule(nn.Module):
    """One bidirectional (top-down then bottom-up) weighted fusion layer.

    Args:
        channels: channel count of every input and output feature map.
        levels: number of pyramid levels fused by this layer.
        init: initial value of every learnable fusion weight.
        conv_cfg, norm_cfg, activation: forwarded to ``ConvModule``.
        eps: small constant added to the weight sums to avoid division by
            zero.
    """

    def __init__(self,
                 channels,
                 levels,
                 init=0.5,
                 conv_cfg=None,
                 norm_cfg=None,
                 activation=None,
                 eps=0.0001):
        super(BiFPNModule, self).__init__()
        self.activation = activation
        self.eps = eps
        self.levels = levels
        self.bifpn_convs = nn.ModuleList()

        # Learnable fusion weights:
        #   w1 -> two-input nodes (all top-down nodes + the last bottom-up
        #         node), one column per level;
        #   w2 -> three-input nodes (intermediate bottom-up nodes).
        self.w1 = nn.Parameter(torch.Tensor(2, levels).fill_(init))
        self.relu1 = nn.ReLU()
        self.w2 = nn.Parameter(torch.Tensor(3, levels - 2).fill_(init))
        self.relu2 = nn.ReLU()

        # 2 * (levels - 1) 3x3 convs: one per fused node, consumed in order
        # by forward() (top-down nodes first, then bottom-up nodes).
        for _ in range(2 * (self.levels - 1)):
            self.bifpn_convs.append(
                nn.Sequential(
                    ConvModule(
                        channels,
                        channels,
                        3,
                        padding=1,
                        conv_cfg=conv_cfg,
                        norm_cfg=norm_cfg,
                        activation=self.activation,
                        inplace=False)))

    # default init_weights for conv(msra) and norm in ConvModule
    def init_weights(self):
        """Apply ``xavier_init`` (uniform) to every ``nn.Conv2d`` submodule."""
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                xavier_init(m, distribution='uniform')

    def forward(self, inputs):
        """Run one top-down pass followed by one bottom-up pass.

        Args:
            inputs: sequence of ``levels`` feature maps. The code upsamples
                by 2 going towards index 0 and max-pools by 2 going towards
                the last index, so adjacent levels are assumed to differ by
                a factor of 2 in spatial size (finest first).

        Returns:
            A new list of ``levels`` fused feature maps; ``inputs`` itself is
            left untouched.
        """
        assert len(inputs) == self.levels
        levels = self.levels
        eps = self.eps

        # ReLU keeps the weights non-negative; then normalize per node.
        # The division is done out of place: dividing the ReLU output in
        # place (`w1 /= ...`) overwrites a tensor autograd still needs for
        # the ReLU backward pass.
        w1 = self.relu1(self.w1)
        w1 = w1 / (torch.sum(w1, dim=0) + eps)
        w2 = self.relu2(self.w2)
        w2 = w2 / (torch.sum(w2, dim=0) + eps)

        # Work on a shallow copy so the caller's sequence is never mutated
        # (this also lets tuples be passed in). Entries of `pathtd` are only
        # rebound, never modified in place, so `original` can keep plain
        # references to the untouched input tensors.
        pathtd = list(inputs)
        original = list(inputs)
        idx_bifpn = 0

        # build top-down
        # NOTE: w1/w2 are already normalized above; the second division below
        # is kept so the numerical result matches the original snippet.
        for i in range(levels - 1, 0, -1):
            upsampled = F.interpolate(
                pathtd[i], scale_factor=2, mode='nearest')
            fused = (w1[0, i - 1] * pathtd[i - 1]
                     + w1[1, i - 1] * upsampled)
            fused = fused / (w1[0, i - 1] + w1[1, i - 1] + eps)
            pathtd[i - 1] = self.bifpn_convs[idx_bifpn](fused)
            idx_bifpn += 1

        # build down-top (intermediate levels: three inputs each)
        for i in range(0, levels - 2):
            downsampled = F.max_pool2d(pathtd[i], kernel_size=2)
            fused = (w2[0, i] * pathtd[i + 1]
                     + w2[1, i] * downsampled
                     + w2[2, i] * original[i + 1])
            fused = fused / (w2[0, i] + w2[1, i] + w2[2, i] + eps)
            pathtd[i + 1] = self.bifpn_convs[idx_bifpn](fused)
            idx_bifpn += 1

        # last level: only two inputs, so it uses the last column of w1
        top = levels - 1
        downsampled = F.max_pool2d(pathtd[top - 1], kernel_size=2)
        fused = w1[0, top] * pathtd[top] + w1[1, top] * downsampled
        fused = fused / (w1[0, top] + w1[1, top] + eps)
        pathtd[top] = self.bifpn_convs[idx_bifpn](fused)
        return pathtd
2.3、EfficientDet结构
组合了backbone(使用了EfficientNet)和BiFPN(特征网络)和Box prediction net,整个框架就是EfficientDet的基本模型,结构如下图:
主干网络采用的是 EfficientNet 网络,BiFPN 是基于其 3~7 层的特征图进行的,融合后的特征喂给一个分类网络和 box 网络,分类与 box 网络在所有特征级上权重是共享的。
PyTorch实现EfficientDet结构:
class EfficientDet(nn.Module):
    """EfficientDet detector: EfficientNet backbone + BIFPN neck + RetinaHead.

    Args:
        num_classes: number of object classes, forwarded to ``RetinaHead``.
        network: key into ``MODEL_MAP`` selecting the pretrained backbone.
        D_bifpn: number of stacked BiFPN layers (``stack`` of ``BIFPN``).
        W_bifpn: channel width of the BiFPN outputs and the head inputs.
        D_class: accepted for interface compatibility; not used by this
            class.
        is_training: if True, ``forward`` expects ``(images, annotations)``
            and returns the loss; otherwise it returns detections.
        threshold: minimum class score for a box to be kept before NMS.
        iou_threshold: IoU threshold passed to ``nms``.
    """

    def __init__(self,
                 num_classes,
                 network='efficientdet-d0',
                 D_bifpn=3,
                 W_bifpn=88,
                 D_class=3,
                 is_training=True,
                 threshold=0.01,
                 iou_threshold=0.5):
        # Imported locally because the snippet shows no module-level
        # `import math`.
        import math

        super(EfficientDet, self).__init__()
        self.backbone = EfficientNet.from_pretrained(MODEL_MAP[network])
        self.is_training = is_training
        self.neck = BIFPN(
            in_channels=self.backbone.get_list_features()[-5:],
            out_channels=W_bifpn,
            stack=D_bifpn,
            num_outs=5)
        self.bbox_head = RetinaHead(
            num_classes=num_classes, in_channels=W_bifpn)
        self.anchors = Anchors()
        self.regressBoxes = BBoxTransform()
        self.clipBoxes = ClipBoxes()
        self.threshold = threshold
        self.iou_threshold = iou_threshold

        # Re-initialize only the newly created neck and head. Iterating over
        # `self.modules()` would also visit the backbone and overwrite the
        # pretrained weights that were just loaded above.
        for part in (self.neck, self.bbox_head):
            for m in part.modules():
                if isinstance(m, nn.Conv2d):
                    n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                    m.weight.data.normal_(0, math.sqrt(2. / n))
                elif isinstance(m, nn.BatchNorm2d):
                    m.weight.data.fill_(1)
                    m.bias.data.zero_()
        self.freeze_bn()
        self.criterion = FocalLoss()

    def freeze_bn(self):
        """Put every ``nn.BatchNorm2d`` layer into eval mode.

        NOTE(review): this method is called by ``__init__`` but was missing
        from the original snippet; this is the conventional implementation --
        confirm against the reference repository.
        """
        for layer in self.modules():
            if isinstance(layer, nn.BatchNorm2d):
                layer.eval()

    def extract_feat(self, img):
        """Run the backbone and the BiFPN neck on ``img``.

        NOTE(review): this method is called by ``forward`` but was missing
        from the original snippet. It assumes the backbone returns a sequence
        of feature maps matching ``get_list_features()``; the last five are
        used because the neck was built from
        ``get_list_features()[-5:]`` -- confirm against the backbone.
        """
        feats = self.backbone(img)
        return self.neck(feats[-5:])

    def forward(self, inputs):
        """Compute the training loss or the post-NMS detections.

        Args:
            inputs: ``(images, annotations)`` when ``is_training`` is True,
                otherwise just the image batch.

        Returns:
            Training: the value returned by the ``FocalLoss`` criterion.
            Inference: ``[scores, class_indices, boxes]`` for the FIRST image
            of the batch only (the code indexes batch element 0); three
            empty tensors if no score exceeds ``threshold``.
        """
        if self.is_training:
            inputs, annotations = inputs

        x = self.extract_feat(inputs)
        outs = self.bbox_head(x)
        # Concatenate the per-level head outputs along dim 1.
        classification = torch.cat(list(outs[0]), dim=1)
        regression = torch.cat(list(outs[1]), dim=1)
        anchors = self.anchors(inputs)

        if self.is_training:
            return self.criterion(
                classification, regression, anchors, annotations)

        transformed_anchors = self.regressBoxes(anchors, regression)
        transformed_anchors = self.clipBoxes(transformed_anchors, inputs)

        # Best class score per anchor; keep anchors above the threshold.
        scores = torch.max(classification, dim=2, keepdim=True)[0]
        scores_over_thresh = (scores > self.threshold)[0, :, 0]
        if scores_over_thresh.sum() == 0:
            print('No boxes to NMS')
            # no boxes to NMS, just return
            return [torch.zeros(0), torch.zeros(0), torch.zeros(0, 4)]

        classification = classification[:, scores_over_thresh, :]
        transformed_anchors = transformed_anchors[:, scores_over_thresh, :]
        scores = scores[:, scores_over_thresh, :]

        anchors_nms_idx = nms(
            transformed_anchors[0, :, :],
            scores[0, :, 0],
            iou_threshold=self.iou_threshold)
        nms_scores, nms_class = classification[0, anchors_nms_idx, :].max(
            dim=1)
        return [nms_scores, nms_class,
                transformed_anchors[0, anchors_nms_idx, :]]
2.4、模型复合扩张
主干网络部分:这部分直接把 EfficientNet 缩放拿过来用即可,即 EfficientNet B0-B6,借助其现成的 checkpoints,就不折腾了;
BiFPN 网络部分:这部分借鉴 EfficientNet,在 Channel(宽度)上按指数级增加,在深度上线性增加,具体的缩放系数公式为:
Box/class 预测网络部分:其宽度与 BiFPN 部分保持一致,深度方面采用 $D_{box} = D_{class} = 3 + \lfloor \phi / 3 \rfloor$ 线性增加;
图片分辨率部分: 因为特征提取选择的是 3~7 层,第 7 层的大小为原始图片的1/2^7,所以输入图像的大小必须是 128 的倍数
D7 明显是超出内存大小了,只是在 D6 基础上增加了分辨率大小。
2.5、EfficientDet结构总结
BiFPN和模型复合扩张策略都非常有效:BiFPN以及对分辨率、深度和宽度的综合平衡都提升了性能。但是一方面,BiFPN中除了Feature map的加权组合是新提出的之外,PANet和shortcut的思路在其他论文中也都提过;另一方面,平衡三者的方法完全是个经验值,并没有理论上的分析或者指导,可能最后还是要依靠NAS来给出最优的策略。
2.6、训练过程与测试结果
参考:
https://zhuanlan.zhihu.com/p/111115509
https://blog.csdn.net/weixin_37179744/article/details/103217305