When training a neural network, we usually want a visual view of how training is going, such as the loss curve, the input images, and the model's accuracy. This information helps us monitor the training process and provides direction and evidence for tuning the parameters.
A simple way to achieve this is to define a list, append each epoch's training results to it, and then, once training is finished, use the data in that list for plotting and other visualization work.
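For example, a minimal sketch of that list-based approach (num_epochs and train_one_epoch are hypothetical placeholders for your own training loop):

import matplotlib.pyplot as plt

num_epochs = 50        # hypothetical epoch count
loss_history = []      # one entry per epoch

for epoch in range(num_epochs):
    epoch_loss = train_one_epoch()  # hypothetical helper returning this epoch's loss
    loss_history.append(epoch_loss)

# After training finishes, plot the collected values
plt.plot(range(1, num_epochs + 1), loss_history)
plt.xlabel('epoch')
plt.ylabel('loss')
plt.show()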
This article presents a more professional approach built on a widely used visualization tool: TensorBoard.
PyTorch already ships with a TensorBoard interface (torch.utils.tensorboard); once TensorBoard is installed, you can call that interface directly to visualize your data.
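As a minimal sketch of that interface (the tag, value, and step here are arbitrary demo values):

from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter()                 # defaults to a ./runs/<timestamp> directory
writer.add_scalar('demo/value', 0.5, 1)  # tag, scalar value, global step
writer.close()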
Below, I will walk through how to use TensorBoard, using LeNet trained on MNIST as the example.
Loading the Data
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
from torch.utils.tensorboard import SummaryWriter
from torchvision import transforms as T
from tqdm import tqdm

# Crop/resize MNIST's 28x28 images to the 32x32 input that LeNet-5 expects
data_transform = T.Compose([
    T.RandomResizedCrop(32),
    T.ToTensor(),
])

train_dataset = torchvision.datasets.MNIST('./', train=True, transform=data_transform, download=True)
val_dataset = torchvision.datasets.MNIST('./', train=False, transform=data_transform, download=True)

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=32)
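Since the introduction mentioned visualizing input images, here is a hedged sketch of how one batch from train_loader could be written to TensorBoard via add_image (preview_writer and the mnist_samples tag are names made up for this sketch):

import torchvision.utils

preview_writer = SummaryWriter(log_dir='./log')
images, labels = next(iter(train_loader))           # one batch: [32, 1, 32, 32]
grid = torchvision.utils.make_grid(images)          # tile the batch into a single image
preview_writer.add_image('mnist_samples', grid, 0)  # shows up under the IMAGES tab
preview_writer.close()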
Defining the Network
class LeNet5(nn.Module):
    def __init__(self, num_class=10):
        super(LeNet5, self).__init__()
        self.conv1 = nn.Conv2d(1, 6, 5)
        self.pool1 = nn.AvgPool2d((2, 2))
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.pool2 = nn.AvgPool2d((2, 2))
        self.conv3 = nn.Conv2d(16, 120, 5)
        self.relu = nn.ReLU()
        self.fc1 = nn.Linear(120, 84)
        self.fc2 = nn.Linear(84, num_class)

    def forward(self, x):           # x: torch.Size([1, 1, 32, 32])
        x = self.conv1(x)           # torch.Size([1, 6, 28, 28])
        x = self.relu(x)
        x = self.pool1(x)           # torch.Size([1, 6, 14, 14])
        x = self.conv2(x)           # torch.Size([1, 16, 10, 10])
        x = self.relu(x)
        x = self.pool2(x)           # torch.Size([1, 16, 5, 5])
        x = self.conv3(x)           # torch.Size([1, 120, 1, 1])
        x = self.relu(x)
        x = x.flatten(start_dim=1)  # torch.Size([1, 120])
        x = self.fc1(x)             # torch.Size([1, 84])
        x = self.relu(x)
        x = self.fc2(x)             # torch.Size([1, 10])
        return x
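A quick sanity check of the forward pass, assuming the class above:

net = LeNet5(10)
out = net(torch.randn(1, 1, 32, 32))  # a dummy single-channel 32x32 input
print(out.shape)                      # torch.Size([1, 10])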
Defining the Model and Other Components
model = LeNet5(10)
loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), 0.003)

# Directory where the TensorBoard log files will be written
logger = SummaryWriter(log_dir='./log')
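Besides scalars, SummaryWriter can also record the network structure with add_graph; a minimal sketch using the model and logger defined above:

dummy_input = torch.randn(1, 1, 32, 32)  # one grayscale 32x32 image
logger.add_graph(model, dummy_input)     # shows up under the GRAPHS tab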
Training the Model
for epoch in range(50):
    epoch_acc_count = 0
    count = 0
    running_loss = 0

    # Training phase
    model.train()
    for data in tqdm(train_loader):
        images, labels = data
        optimizer.zero_grad()
        output = model(images)
        loss = loss_function(output, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        # Accumulate the number of correct predictions in this epoch
        epoch_acc_count += (output.argmax(axis=1) == labels.view(-1)).sum().item()
        count += len(images)

    # Write this epoch's training metrics to the log
    logger.add_scalar('train_loss', running_loss / len(train_loader), epoch + 1)
    logger.add_scalar('train_accuracy', epoch_acc_count / count, epoch + 1)

    epoch_acc_count = 0
    count = 0
    running_loss = 0

    # Validation phase: no gradient computation and no optimizer updates here
    model.eval()
    with torch.no_grad():
        for data in tqdm(val_loader):
            images, labels = data
            output = model(images)
            loss = loss_function(output, labels)
            running_loss += loss.item()
            epoch_acc_count += (output.argmax(axis=1) == labels.view(-1)).sum().item()
            count += len(images)

    logger.add_scalar('val_loss', running_loss / len(val_loader), epoch + 1)
    logger.add_scalar('val_accuracy', epoch_acc_count / count, epoch + 1)

# Flush any pending events and close the writer once training is done
logger.close()
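If you also want to watch how the weights evolve, SummaryWriter offers add_histogram; a hedged sketch that could be appended at the end of each epoch inside the loop above:

for name, param in model.named_parameters():
    logger.add_histogram(name, param, epoch + 1)  # one histogram per parameter tensor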
Launching TensorBoard from the Command Line
TensorBoard is started with the command below, where log_path is the directory the logs were written to during training (./log in this example).
tensorboard --logdir=log_path
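Once it is running, TensorBoard serves at http://localhost:6006 by default; open that address in a browser to find the train/val curves logged above under the SCALARS tab. If that port is taken, another one can be chosen with --port:

tensorboard --logdir=./log --port=6007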