1.5 ResNet
As networks grew deeper, model accuracy kept improving, but deeper networks could not escape one problem: the vanishing-gradient phenomenon became increasingly severe. During backpropagation, the gradients reaching the earlier layers become very small, which means those layers are barely updated; adding depth then becomes pointless and only adds computational cost. Moreover, a deeper network has more parameters and is harder to optimize. The arrival of the residual network greatly alleviated this problem.
Kaiming He and his colleagues at Microsoft Research Asia proposed the Deep Residual Network, which won that year's ImageNet competition. The network, ResNet for short (named after its residual connections), reached 152 layers and brought the top-5 error rate down to 3.57%, compared with the 6.7% of GoogLeNet, the 2014 winner. The structure of ResNet is shown in the figure:
The residual network adds an identity mapping that carries the current output directly across the layer to the next one, in effect taking a shortcut that skips the computation of the current layer; this is called a skip connection. Likewise, during backpropagation the gradient of the next layer is passed directly to the previous layer, which greatly alleviates the vanishing-gradient problem in deep networks. Its expression is:

y = F(x, {W_i}) + x

where x and y are the input and output of the block and F(x, {W_i}) is the residual mapping learned by the stacked layers.
By carrying features forward layer by layer in this way, ResNet preserves feature information across the levels of the model and solves the vanishing-gradient problem to a considerable extent, while the shortcut connections introduce almost no extra parameters. These advantages have made ResNet the backbone of choice for many detection, segmentation, and other algorithms.
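To make the skip connection concrete, here is a minimal sketch of a residual block; the MiniResidualBlock class is purely illustrative and is not part of the reference implementation that follows:

```python
import torch
import torch.nn as nn

# Illustrative residual block: output = F(x) + x, with F two 3x3 convolutions
class MiniResidualBlock(nn.Module):
    def __init__(self, channels):
        super().__init__()
        self.conv1 = nn.Conv2d(channels, channels, 3, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(channels)
        self.conv2 = nn.Conv2d(channels, channels, 3, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(channels)
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        return self.relu(out + x)  # the skip connection adds x unchanged

x = torch.randn(1, 64, 56, 56)
print(MiniResidualBlock(64)(x).shape)  # torch.Size([1, 64, 56, 56])
```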
Below is a PyTorch implementation of ResNet:
```python
import math
from collections import OrderedDict

import torch.nn as nn


def conv3x3(in_planes, out_planes, stride=1):
    """3x3 convolution with padding"""
    return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
                     padding=1, bias=False)


class BasicBlock(nn.Module):
    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(BasicBlock, self).__init__()
        m = OrderedDict()
        m['conv1'] = conv3x3(inplanes, planes, stride)
        m['bn1'] = nn.BatchNorm2d(planes)
        m['relu1'] = nn.ReLU(inplace=True)
        m['conv2'] = conv3x3(planes, planes)
        m['bn2'] = nn.BatchNorm2d(planes)
        self.group1 = nn.Sequential(m)
        self.relu = nn.Sequential(nn.ReLU(inplace=True))
        self.downsample = downsample

    def forward(self, x):
        if self.downsample is not None:
            residual = self.downsample(x)
        else:
            residual = x
        out = self.group1(x) + residual
        out = self.relu(out)
        return out


class Bottleneck(nn.Module):
    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(Bottleneck, self).__init__()
        m = OrderedDict()
        m['conv1'] = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
        m['bn1'] = nn.BatchNorm2d(planes)
        m['relu1'] = nn.ReLU(inplace=True)
        m['conv2'] = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
                               padding=1, bias=False)
        m['bn2'] = nn.BatchNorm2d(planes)
        m['relu2'] = nn.ReLU(inplace=True)
        m['conv3'] = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False)
        m['bn3'] = nn.BatchNorm2d(planes * 4)
        self.group1 = nn.Sequential(m)
        self.relu = nn.Sequential(nn.ReLU(inplace=True))
        self.downsample = downsample

    def forward(self, x):
        if self.downsample is not None:
            residual = self.downsample(x)
        else:
            residual = x
        out = self.group1(x) + residual
        out = self.relu(out)
        return out


class ResNet(nn.Module):
    def __init__(self, block, layers, num_classes=1000):
        self.inplanes = 64
        super(ResNet, self).__init__()
        m = OrderedDict()
        m['conv1'] = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3,
                               bias=False)
        m['bn1'] = nn.BatchNorm2d(64)
        m['relu1'] = nn.ReLU(inplace=True)
        m['maxpool'] = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.group1 = nn.Sequential(m)

        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)

        self.avgpool = nn.Sequential(nn.AvgPool2d(7))
        self.group2 = nn.Sequential(
            OrderedDict([('fc', nn.Linear(512 * block.expansion, num_classes))])
        )

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()

    def _make_layer(self, block, planes, blocks, stride=1):
        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                nn.Conv2d(self.inplanes, planes * block.expansion,
                          kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(planes * block.expansion),
            )
        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample))
        self.inplanes = planes * block.expansion
        for i in range(1, blocks):
            layers.append(block(self.inplanes, planes))
        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.group1(x)
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = self.avgpool(x)
        x = x.view(x.size(0), -1)
        x = self.group2(x)
        return x


def resnet18(pretrained=False, model_root=None, **kwargs):
    model = ResNet(BasicBlock, [2, 2, 2, 2], **kwargs)
    return model
```
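As a quick sanity check, the network can be instantiated and run on a dummy batch (a usage sketch; the 224×224 input matches the fixed 7×7 average pooling in the code above):

```python
import torch

model = resnet18()                  # BasicBlock with layers [2, 2, 2, 2]
x = torch.randn(1, 3, 224, 224)     # ImageNet-sized dummy input
y = model(x)
print(y.shape)                      # torch.Size([1, 1000])
```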
1.6 DenseNet
Inspired by ResNet, DenseNet was proposed in 2016. DenseNet feeds each convolutional layer with the concatenation of the outputs of all preceding layers. This dense connectivity lets every layer make use of all previously learned features without relearning them. At the same time, echoing the structure of ResNet, gradients propagate more easily, making deep networks more convenient to train. The structure of DenseNet is shown in the figure.
Compared with ResNet, DenseNet keeps concatenating lower-level features forward. The main formula is:

x_l = H_l([x_0, x_1, ..., x_{l-1}])

where [x_0, x_1, ..., x_{l-1}] denotes the channel-wise concatenation of the feature maps of all preceding layers and H_l is the composite function of the l-th layer.
Detailed configuration table:
Note that in the table, DenseNet consists mainly of Dense Blocks and Transition Layers, and each "conv" entry denotes a BatchNorm + ReLU + Conv sequence.
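To see the dense connectivity at work, the sketch below (illustrative only; the random tensors stand in for real layer outputs) shows how channel-wise concatenation grows the channel count by the growth rate at every layer:

```python
import torch

growth_rate = 32
x = torch.randn(1, 64, 56, 56)           # initial feature map with 64 channels
for l in range(3):
    # stand-in for H_l: any operation producing growth_rate new channels
    new_features = torch.randn(1, growth_rate, 56, 56)
    x = torch.cat([x, new_features], dim=1)
    print(x.shape[1])                    # 96, 128, 160: +32 channels per layer
```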
Below is a PyTorch implementation of DenseNet:
```python
import torch
import torch.nn as nn
from collections import OrderedDict


class _DenseLayer(nn.Sequential):
    def __init__(self, in_channels, growth_rate, bn_size):
        super(_DenseLayer, self).__init__()
        self.add_module('norm1', nn.BatchNorm2d(in_channels))
        self.add_module('relu1', nn.ReLU(inplace=True))
        self.add_module('conv1', nn.Conv2d(in_channels, bn_size * growth_rate,
                                           kernel_size=1, stride=1, bias=False))
        self.add_module('norm2', nn.BatchNorm2d(bn_size * growth_rate))
        self.add_module('relu2', nn.ReLU(inplace=True))
        self.add_module('conv2', nn.Conv2d(bn_size * growth_rate, growth_rate,
                                           kernel_size=3, stride=1, padding=1,
                                           bias=False))

    # Override forward to concatenate the input with the new feature maps
    def forward(self, x):
        new_features = super(_DenseLayer, self).forward(x)
        return torch.cat([x, new_features], 1)


class _DenseBlock(nn.Sequential):
    def __init__(self, num_layers, in_channels, bn_size, growth_rate):
        super(_DenseBlock, self).__init__()
        for i in range(num_layers):
            self.add_module('denselayer%d' % (i + 1),
                            _DenseLayer(in_channels + growth_rate * i,
                                        growth_rate, bn_size))


class _Transition(nn.Sequential):
    def __init__(self, in_channels, out_channels):
        super(_Transition, self).__init__()
        self.add_module('norm', nn.BatchNorm2d(in_channels))
        self.add_module('relu', nn.ReLU(inplace=True))
        self.add_module('conv', nn.Conv2d(in_channels, out_channels,
                                          kernel_size=1, stride=1, bias=False))
        self.add_module('pool', nn.AvgPool2d(kernel_size=2, stride=2))


class DenseNet_BC(nn.Module):
    def __init__(self, growth_rate=12, block_config=(6, 12, 24, 16),
                 bn_size=4, theta=0.5, num_classes=10):
        super(DenseNet_BC, self).__init__()

        # The initial convolution outputs 2x growth_rate channels
        num_init_feature = 2 * growth_rate

        # num_classes == 10 indicates CIFAR-10
        if num_classes == 10:
            self.features = nn.Sequential(OrderedDict([
                ('conv0', nn.Conv2d(3, num_init_feature, kernel_size=3,
                                    stride=1, padding=1, bias=False)),
            ]))
        else:
            self.features = nn.Sequential(OrderedDict([
                ('conv0', nn.Conv2d(3, num_init_feature, kernel_size=7,
                                    stride=2, padding=3, bias=False)),
                ('norm0', nn.BatchNorm2d(num_init_feature)),
                ('relu0', nn.ReLU(inplace=True)),
                ('pool0', nn.MaxPool2d(kernel_size=3, stride=2, padding=1))
            ]))

        num_feature = num_init_feature
        for i, num_layers in enumerate(block_config):
            self.features.add_module('denseblock%d' % (i + 1),
                                     _DenseBlock(num_layers, num_feature,
                                                 bn_size, growth_rate))
            num_feature = num_feature + growth_rate * num_layers
            if i != len(block_config) - 1:
                self.features.add_module('transition%d' % (i + 1),
                                         _Transition(num_feature,
                                                     int(num_feature * theta)))
                num_feature = int(num_feature * theta)

        self.features.add_module('norm5', nn.BatchNorm2d(num_feature))
        self.features.add_module('relu5', nn.ReLU(inplace=True))
        self.features.add_module('avg_pool', nn.AdaptiveAvgPool2d((1, 1)))
        self.classifier = nn.Linear(num_feature, num_classes)

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight)
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.Linear):
                nn.init.constant_(m.bias, 0)

    def forward(self, x):
        features = self.features(x)
        out = features.view(features.size(0), -1)
        out = self.classifier(out)
        return out


# DenseNet_BC for ImageNet
def DenseNet121():
    return DenseNet_BC(growth_rate=32, block_config=(6, 12, 24, 16),
                       num_classes=1000)

def DenseNet169():
    return DenseNet_BC(growth_rate=32, block_config=(6, 12, 32, 32),
                       num_classes=1000)

def DenseNet201():
    return DenseNet_BC(growth_rate=32, block_config=(6, 12, 48, 32),
                       num_classes=1000)

def DenseNet161():
    return DenseNet_BC(growth_rate=48, block_config=(6, 12, 36, 24),
                       num_classes=1000)

# DenseNet_BC for CIFAR
def densenet_BC_100():
    return DenseNet_BC(growth_rate=12, block_config=(16, 16, 16))

def test():
    net = densenet_BC_100()
    x = torch.randn(2, 3, 32, 32)
    y = net(x)
    print(y.size())

test()
```
1.7 SENet
SENet is inspired by the attention mechanisms of recent years. Its key idea is to predict a weight for each output channel and then rescale every channel by that weight. A standard 2D convolution mixes information within a local spatial neighborhood, so in essence it models the spatial information of the image but does not explicitly model the relationships between channels; SENet therefore sets out to model the inter-channel information explicitly.
The basic structure of SENet is shown in the figure:
A series of convolution and pooling operations first produces a feature map of size C×H×W, and then the Squeeze and Excitation operations are applied:
- Squeeze: apply global average pooling to the C×H×W feature map to obtain a 1×1×C descriptor; this descriptor can be understood as having a global receptive field.
- Excitation: pass the squeezed descriptor through a small fully connected network that applies a nonlinear transformation.
- Feature recalibration: take the Excitation output as per-channel weights and multiply it with the input features, as shown in the sketch below.
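Putting the three steps together, a minimal standalone SE module might look like the following sketch (an illustration only; the SEModule class is hypothetical, and reduction=16 follows the reduction ratio used in the paper):

```python
import torch
import torch.nn as nn

class SEModule(nn.Module):
    def __init__(self, channels, reduction=16):
        super().__init__()
        self.squeeze = nn.AdaptiveAvgPool2d(1)          # C×H×W -> C×1×1
        self.excite = nn.Sequential(                    # bottleneck FC network
            nn.Linear(channels, channels // reduction),
            nn.ReLU(inplace=True),
            nn.Linear(channels // reduction, channels),
            nn.Sigmoid(),                               # weights in (0, 1)
        )

    def forward(self, x):
        b, c, _, _ = x.shape
        w = self.squeeze(x).view(b, c)                  # Squeeze
        w = self.excite(w).view(b, c, 1, 1)             # Excitation
        return x * w                                    # recalibration

x = torch.randn(2, 64, 32, 32)
print(SEModule(64)(x).shape)                            # torch.Size([2, 64, 32, 32])
```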
Below is a PyTorch implementation of SENet:
```python
import torch
import torch.nn as nn
import torch.nn.functional as F


class BasicBlock(nn.Module):
    def __init__(self, in_planes, planes, stride=1):
        super(BasicBlock, self).__init__()
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3,
                               stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1,
                               padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)

        self.shortcut = nn.Sequential()
        if stride != 1 or in_planes != planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, planes, kernel_size=1, stride=stride,
                          bias=False),
                nn.BatchNorm2d(planes)
            )

        # SE layers: 1x1 convolutions act as fully connected layers
        self.fc1 = nn.Conv2d(planes, planes // 16, kernel_size=1)
        self.fc2 = nn.Conv2d(planes // 16, planes, kernel_size=1)

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))

        # Squeeze: global average pooling down to 1x1
        w = F.avg_pool2d(out, out.size(2))
        # Excitation: bottleneck FC network with a sigmoid gate
        w = F.relu(self.fc1(w))
        w = torch.sigmoid(self.fc2(w))
        # Recalibrate the feature maps channel-wise (broadcasting)
        out = out * w

        out += self.shortcut(x)
        out = F.relu(out)
        return out


class PreActBlock(nn.Module):
    def __init__(self, in_planes, planes, stride=1):
        super(PreActBlock, self).__init__()
        self.bn1 = nn.BatchNorm2d(in_planes)
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3,
                               stride=stride, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1,
                               padding=1, bias=False)

        if stride != 1 or in_planes != planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, planes, kernel_size=1, stride=stride,
                          bias=False)
            )

        # SE layers
        self.fc1 = nn.Conv2d(planes, planes // 16, kernel_size=1)
        self.fc2 = nn.Conv2d(planes // 16, planes, kernel_size=1)

    def forward(self, x):
        out = F.relu(self.bn1(x))
        shortcut = self.shortcut(out) if hasattr(self, 'shortcut') else x
        out = self.conv1(out)
        out = self.conv2(F.relu(self.bn2(out)))

        # Squeeze
        w = F.avg_pool2d(out, out.size(2))
        # Excitation
        w = F.relu(self.fc1(w))
        w = torch.sigmoid(self.fc2(w))
        out = out * w

        out += shortcut
        return out


class SENet(nn.Module):
    def __init__(self, block, num_blocks, num_classes=10):
        super(SENet, self).__init__()
        self.in_planes = 64
        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1,
                               bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1)
        self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)
        self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2)
        self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2)
        self.linear = nn.Linear(512, num_classes)

    def _make_layer(self, block, planes, num_blocks, stride):
        strides = [stride] + [1] * (num_blocks - 1)
        layers = []
        for stride in strides:
            layers.append(block(self.in_planes, planes, stride))
            self.in_planes = planes
        return nn.Sequential(*layers)

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.layer1(out)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.layer4(out)
        out = F.avg_pool2d(out, 4)
        out = out.view(out.size(0), -1)
        out = self.linear(out)
        return out


def SENet18():
    return SENet(PreActBlock, [2, 2, 2, 2])


def test():
    net = SENet18()
    y = net(torch.randn(1, 3, 32, 32))
    print(y.size())

# test()
```
1.8 SKNet
SKNet studies attention over convolution kernels: receptive fields (kernels) of different sizes suit targets of different scales (near or far, large or small). Architectures such as Inception do add multiple kernel sizes to accommodate images of different scales, but once training finishes the parameters are fixed, so every input combines the multi-scale branches with the same fixed weights regardless of its scale.
SKNet proposes a mechanism that makes the importance of each kernel input-dependent: different images yield different weights over the kernels, in effect generating a dynamic kernel adapted to each image's scale. According to the authors, the module brings a large improvement on super-resolution tasks, and the experiments in the paper also confirm strong performance on classification.
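The heart of the selection step can be sketched as a softmax-weighted fusion over branch outputs (a simplified illustration with two stand-in branches and random attention logits, not the full SKConv given below):

```python
import torch

# Two branch outputs of shape (N, C, H, W), e.g. from 3x3 and 5x5 convolutions
b3x3 = torch.randn(2, 64, 32, 32)
b5x5 = torch.randn(2, 64, 32, 32)
feas = torch.stack([b3x3, b5x5], dim=1)         # (N, M=2, C, H, W)

# Per-channel attention logits for each branch; the real module computes
# these from a pooled descriptor, here they are random for illustration
logits = torch.randn(2, 2, 64)
attn = torch.softmax(logits, dim=1)             # branch weights sum to 1
attn = attn.unsqueeze(-1).unsqueeze(-1)         # (N, M, C, 1, 1)

fused = (feas * attn).sum(dim=1)                # (N, C, H, W)
print(fused.shape)
```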
The overall structure is shown in the figure below:
The figure below is the network diagram redrawn from the code by the author of the GiantPandaCV public account. Below is a PyTorch implementation of SKNet:
```python
import torch
from torch import nn


# SKConv replaces the standard 3x3 convolution
class SKConv(nn.Module):
    def __init__(self, features, WH, M, G, r, stride=1, L=32):
        """Constructor
        Args:
            features: input channel dimensionality.
            WH: input spatial dimensionality, used for GAP kernel size.
            M: the number of branches.
            G: number of convolution groups.
            r: the ratio for computing d, the length of z.
            stride: stride, default 1.
            L: the minimum dim of the vector z in the paper, default 32.
        """
        super(SKConv, self).__init__()
        d = max(int(features / r), L)
        self.M = M
        self.features = features
        # M branches with kernel sizes 3, 5, ... (padding keeps the size)
        self.convs = nn.ModuleList([])
        for i in range(M):
            self.convs.append(nn.Sequential(
                nn.Conv2d(features, features, kernel_size=3 + i * 2,
                          stride=stride, padding=1 + i, groups=G),
                nn.BatchNorm2d(features),
                nn.ReLU(inplace=False)
            ))
        self.gap = nn.AvgPool2d(int(WH / stride))
        self.fc = nn.Linear(features, d)
        self.fcs = nn.ModuleList([])
        for i in range(M):
            self.fcs.append(nn.Linear(d, features))
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        # Split: run every branch and stack the results along a new dim
        for i, conv in enumerate(self.convs):
            fea = conv(x).unsqueeze_(dim=1)
            if i == 0:
                feas = fea
            else:
                feas = torch.cat([feas, fea], dim=1)
        # Fuse: element-wise sum, then global average pooling
        fea_U = torch.sum(feas, dim=1)
        fea_s = self.gap(fea_U).squeeze_()
        fea_z = self.fc(fea_s)
        # Select: per-branch attention vectors normalized with softmax
        for i, fc in enumerate(self.fcs):
            vector = fc(fea_z).unsqueeze_(dim=1)
            if i == 0:
                attention_vectors = vector
            else:
                attention_vectors = torch.cat([attention_vectors, vector],
                                              dim=1)
        attention_vectors = self.softmax(attention_vectors)
        attention_vectors = attention_vectors.unsqueeze(-1).unsqueeze(-1)
        fea_v = (feas * attention_vectors).sum(dim=1)
        return fea_v


# The new residual unit built around SKConv
class SKUnit(nn.Module):
    def __init__(self, in_features, out_features, WH, M, G, r,
                 mid_features=None, stride=1, L=32):
        """Constructor
        Args:
            in_features: input channel dimensionality.
            out_features: output channel dimensionality.
            WH: input spatial dimensionality, used for GAP kernel size.
            M: the number of branches.
            G: number of convolution groups.
            r: the ratio for computing d, the length of z.
            mid_features: the channel dim of the middle conv with stride
                not 1, default out_features/2.
            stride: stride.
            L: the minimum dim of the vector z in the paper.
        """
        super(SKUnit, self).__init__()
        if mid_features is None:
            mid_features = int(out_features / 2)
        self.feas = nn.Sequential(
            nn.Conv2d(in_features, mid_features, 1, stride=1),
            nn.BatchNorm2d(mid_features),
            SKConv(mid_features, WH, M, G, r, stride=stride, L=L),
            nn.BatchNorm2d(mid_features),
            nn.Conv2d(mid_features, out_features, 1, stride=1),
            nn.BatchNorm2d(out_features)
        )
        if in_features == out_features:
            # when the dim does not change, the input can be added directly
            self.shortcut = nn.Sequential()
        else:
            # when the dim changes, the shortcut must also change dim
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_features, out_features, 1, stride=stride),
                nn.BatchNorm2d(out_features)
            )

    def forward(self, x):
        fea = self.feas(x)
        return fea + self.shortcut(x)


class SKNet(nn.Module):
    def __init__(self, class_num):
        super(SKNet, self).__init__()
        self.basic_conv = nn.Sequential(
            nn.Conv2d(3, 64, 3, padding=1),
            nn.BatchNorm2d(64)
        )  # 32x32
        self.stage_1 = nn.Sequential(
            SKUnit(64, 256, 32, 2, 8, 2, stride=2),
            nn.ReLU(),
            SKUnit(256, 256, 32, 2, 8, 2),
            nn.ReLU(),
            SKUnit(256, 256, 32, 2, 8, 2),
            nn.ReLU()
        )  # 32x32
        self.stage_2 = nn.Sequential(
            SKUnit(256, 512, 32, 2, 8, 2, stride=2),
            nn.ReLU(),
            SKUnit(512, 512, 32, 2, 8, 2),
            nn.ReLU(),
            SKUnit(512, 512, 32, 2, 8, 2),
            nn.ReLU()
        )  # 16x16
        self.stage_3 = nn.Sequential(
            SKUnit(512, 1024, 32, 2, 8, 2, stride=2),
            nn.ReLU(),
            SKUnit(1024, 1024, 32, 2, 8, 2),
            nn.ReLU(),
            SKUnit(1024, 1024, 32, 2, 8, 2),
            nn.ReLU()
        )  # 8x8
        self.pool = nn.AvgPool2d(8)
        self.classifier = nn.Sequential(
            nn.Linear(1024, class_num),
            # nn.Softmax(dim=1)
        )

    def forward(self, x):
        fea = self.basic_conv(x)
        fea = self.stage_1(fea)
        fea = self.stage_2(fea)
        fea = self.stage_3(fea)
        fea = self.pool(fea)
        fea = torch.squeeze(fea)
        fea = self.classifier(fea)
        return fea


if __name__ == '__main__':
    x = torch.rand(8, 64, 32, 32)
    conv = SKConv(64, 32, 3, 8, 2)
    out = conv(x)
    criterion = nn.L1Loss()
    loss = criterion(out, x)
    loss.backward()
    print('out shape : {}'.format(out.shape))
    print('loss value : {}'.format(loss))
```
1.9 ResNeSt
ResNeSt builds on ResNet by introducing the Split-Attention block, which applies feature-map attention across different feature-map groups. The Split-Attention block is a computational unit composed of a feature-map group structure and split-attention operations.
A ResNeSt block first divides the input into K groups, denoted Cardinal 1 through Cardinal K; each Cardinal group is then further split into R groups, denoted Split 1 through Split R, so there are G = K × R groups in total.
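No reference implementation is given in this section, so the following is only a rough sketch of split attention within a single cardinal group, under assumed simplifications (the R splits are modeled as parallel 3×3 convolutions and the softmax weights are computed from a pooled descriptor; the SplitAttention class is hypothetical, not the official ResNeSt code):

```python
import torch
import torch.nn as nn

class SplitAttention(nn.Module):
    """Split attention within one cardinal group: R parallel branches fused
    by per-channel softmax weights (illustrative sketch)."""
    def __init__(self, channels, radix=2, reduction=4):
        super().__init__()
        self.radix = radix
        self.convs = nn.ModuleList([
            nn.Sequential(
                nn.Conv2d(channels, channels, 3, padding=1, bias=False),
                nn.BatchNorm2d(channels),
                nn.ReLU(inplace=True),
            ) for _ in range(radix)
        ])
        inter = max(channels // reduction, 32)
        self.fc1 = nn.Conv2d(channels, inter, 1)          # shared dense layer
        self.fc2 = nn.Conv2d(inter, channels * radix, 1)  # one logit set per split

    def forward(self, x):
        feas = torch.stack([conv(x) for conv in self.convs], dim=1)  # (N, R, C, H, W)
        gap = feas.sum(dim=1).mean(dim=(2, 3), keepdim=True)         # fuse, then pool
        attn = self.fc2(torch.relu(self.fc1(gap)))                   # (N, R*C, 1, 1)
        attn = attn.view(x.size(0), self.radix, -1, 1, 1)
        attn = torch.softmax(attn, dim=1)               # weights over the R splits
        return (feas * attn).sum(dim=1)                 # (N, C, H, W)

x = torch.randn(2, 64, 32, 32)
print(SplitAttention(64)(x).shape)                      # torch.Size([2, 64, 32, 32])
```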