3D convolution 最早应该是在论文“3D Convolutional Neural Networks for Human Action Recognition”中被提出并用于行为识别的。该论文提出的模型尝试从空间和时间两个维度中提取特征,从而捕获在多个相邻帧中编码的运动信息。
3、我们提出通过为模型增加以高级运动特征计算得到的辅助输出来正则化3D CNN模型。我们进一步提出通过组合多种不同架构的输出来提高3D CNN模型的性能。
3D ConvNets 更适合学习时空特征,通过3D卷积和3D池化,可以对时间信息建模,而2D卷积只能在空间上学习特征。3D和2D的区别如下:
3D卷积核时间深度搜索。不同卷积核时间深度设置在UCF101测试集split-1上的精度。2D ConvNet效果最差,3×3×3卷积核的3D ConvNet在实验中表现最佳。
网络架构:上图的发现表明,3×3×3卷积核的均匀设置是3D ConvNets的最佳选择。这个发现与2D ConvNets一致。使用大型数据集,可以根据机器内存限制和计算承受能力,尽可能深地训练具有3×3×3核的3D ConvNet。使用目前的GPU内存,我们设计了3D ConvNet,具有8个卷积层、5个池化层、两个全连接层,以及一个softmax输出层。网络架构如图3所示。为了简单起见,我们从现在开始将这个网络称为C3D。所有3D卷积滤波器均为3×3×3,步长为1×1×1。为了保持早期的时间信息,设置pool1核大小为1×2×2、步长为1×2×2,其余所有3D池化层均为2×2×2,步长为2×2×2。每个全连接层有4096个输出单元。
import torch
import torch.nn as nn
from mypath import Path


class C3D(nn.Module):
    """The C3D network: eight 3x3x3 convolutions, five max-pools, three FC layers.

    ``fc6`` takes 8192 input features, which is what the convolutional stack
    produces for clips shaped ``(batch, 3, 16, 112, 112)`` (the shape used by
    the demo at the bottom of this file).

    Args:
        num_classes: Number of output logits produced by ``fc8``.
        pretrained: If true, load the convolutional, ``fc6`` and ``fc7``
            weights from the checkpoint located at ``Path.model_dir()``.
    """

    def __init__(self, num_classes, pretrained=False):
        super(C3D, self).__init__()

        self.conv1 = nn.Conv3d(3, 64, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        # pool1 does not pool along the first (depth) axis; pools 2-5 halve it.
        self.pool1 = nn.MaxPool3d(kernel_size=(1, 2, 2), stride=(1, 2, 2))

        self.conv2 = nn.Conv3d(64, 128, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        self.pool2 = nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2))

        self.conv3a = nn.Conv3d(128, 256, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        self.conv3b = nn.Conv3d(256, 256, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        self.pool3 = nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2))

        self.conv4a = nn.Conv3d(256, 512, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        self.conv4b = nn.Conv3d(512, 512, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        self.pool4 = nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2))

        self.conv5a = nn.Conv3d(512, 512, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        self.conv5b = nn.Conv3d(512, 512, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        # Spatial padding of 1 turns a 7x7 map into 4x4, giving 512*1*4*4 = 8192.
        self.pool5 = nn.MaxPool3d(
            kernel_size=(2, 2, 2), stride=(2, 2, 2), padding=(0, 1, 1)
        )

        self.fc6 = nn.Linear(8192, 4096)
        self.fc7 = nn.Linear(4096, 4096)
        self.fc8 = nn.Linear(4096, num_classes)

        self.dropout = nn.Dropout(p=0.5)
        self.relu = nn.ReLU()

        self.__init_weight()

        if pretrained:
            self.__load_pretrained_weights()

    def forward(self, x):
        """Compute class logits for a batch of clips.

        Args:
            x: Input tensor with 3 channels; the layer sizes assume the shape
                ``(batch, 3, 16, 112, 112)``.

        Returns:
            Tensor of shape ``(batch, num_classes)`` holding unnormalised logits.
        """
        x = self.pool1(self.relu(self.conv1(x)))
        x = self.pool2(self.relu(self.conv2(x)))

        x = self.relu(self.conv3a(x))
        x = self.relu(self.conv3b(x))
        x = self.pool3(x)

        x = self.relu(self.conv4a(x))
        x = self.relu(self.conv4b(x))
        x = self.pool4(x)

        x = self.relu(self.conv5a(x))
        x = self.relu(self.conv5b(x))
        x = self.pool5(x)

        # Flatten per sample. Keeping the batch dimension fixed (instead of
        # view(-1, 8192)) makes a wrongly sized clip fail with a shape error in
        # fc6 rather than being silently reshaped into extra batch rows.
        x = x.view(x.size(0), -1)

        x = self.dropout(self.relu(self.fc6(x)))
        x = self.dropout(self.relu(self.fc7(x)))
        return self.fc8(x)

    def __load_pretrained_weights(self):
        """Copy pretrained conv/fc6/fc7 weights from ``Path.model_dir()``.

        Checkpoint entries are renamed from the ``features.N`` /
        ``classifier.N`` scheme to this module's attribute names. Entries
        without a mapping are ignored; ``fc8`` has no mapping and therefore
        keeps its freshly initialised weights.
        """
        corresp_name = {
            # Conv1
            "features.0.weight": "conv1.weight",
            "features.0.bias": "conv1.bias",
            # Conv2
            "features.3.weight": "conv2.weight",
            "features.3.bias": "conv2.bias",
            # Conv3a
            "features.6.weight": "conv3a.weight",
            "features.6.bias": "conv3a.bias",
            # Conv3b
            "features.8.weight": "conv3b.weight",
            "features.8.bias": "conv3b.bias",
            # Conv4a
            "features.11.weight": "conv4a.weight",
            "features.11.bias": "conv4a.bias",
            # Conv4b
            "features.13.weight": "conv4b.weight",
            "features.13.bias": "conv4b.bias",
            # Conv5a
            "features.16.weight": "conv5a.weight",
            "features.16.bias": "conv5a.bias",
            # Conv5b
            "features.18.weight": "conv5b.weight",
            "features.18.bias": "conv5b.bias",
            # fc6
            "classifier.0.weight": "fc6.weight",
            "classifier.0.bias": "fc6.bias",
            # fc7
            "classifier.3.weight": "fc7.weight",
            "classifier.3.bias": "fc7.bias",
        }

        # map_location="cpu" lets a checkpoint saved on a GPU load on a
        # CPU-only host; load_state_dict copies values into the existing
        # parameters, so the model's own device is unaffected.
        # NOTE: torch.load unpickles the file - only use trusted checkpoints.
        p_dict = torch.load(Path.model_dir(), map_location="cpu")

        s_dict = self.state_dict()
        s_dict.update(
            (corresp_name[name], tensor)
            for name, tensor in p_dict.items()
            if name in corresp_name
        )
        self.load_state_dict(s_dict)

    def __init_weight(self):
        """Initialise Conv3d weights (Kaiming normal) and any BatchNorm3d layers.

        Other parameters (conv biases, linear layers) keep their default
        initialisation.
        """
        for m in self.modules():
            if isinstance(m, nn.Conv3d):
                torch.nn.init.kaiming_normal_(m.weight)
            elif isinstance(m, nn.BatchNorm3d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()


def get_1x_lr_params(model):
    """Yield the trainable parameters of every conv layer and of fc6/fc7.

    The name suggests these are meant for the base (1x) learning-rate group;
    this function itself only selects parameters with ``requires_grad`` set.
    """
    layers = [
        model.conv1, model.conv2,
        model.conv3a, model.conv3b,
        model.conv4a, model.conv4b,
        model.conv5a, model.conv5b,
        model.fc6, model.fc7,
    ]
    for layer in layers:
        for param in layer.parameters():
            if param.requires_grad:
                yield param


def get_10x_lr_params(model):
    """Yield the trainable parameters of the last fully connected layer (fc8).

    The name suggests these are meant for a 10x learning-rate group; this
    function itself only selects parameters with ``requires_grad`` set.
    """
    for param in model.fc8.parameters():
        if param.requires_grad:
            yield param


if __name__ == "__main__":
    inputs = torch.rand(1, 3, 16, 112, 112)
    net = C3D(num_classes=101, pretrained=True)

    # Call the module rather than .forward() so registered hooks also run.
    outputs = net(inputs)
    print(outputs.size())
- C3D卷积网络将完整的视频帧作为输入,并不依赖于任何预处理,可以轻松地扩展到大数据集。