Learning Summary
(1) Every optimizer is a class, so it must be instantiated before it can be used, for example:

```python
class Net(nn.Module):
    ...

net = Net()
optim = torch.optim.SGD(net.parameters(), lr=lr)
optim.step()
```
(2) Within each training epoch, the optimizer has to carry out two steps:
zeroing the gradients and updating the parameters.

```python
optimizer = torch.optim.SGD(net.parameters(), lr=1e-5)
for epoch in range(EPOCH):
    ...
    optimizer.zero_grad()   # zero the gradients
    loss = ...              # compute the loss
    loss.backward()         # backpropagate
    optimizer.step()        # update the parameters
```
1. Optimizers
The goal of deep learning is to keep adjusting the network parameters so that they apply non-linear transformations to the input that fit the target output. In essence, this is a function searching for an optimal solution, except that the solution is a set of weight matrices, and how to find it quickly is a key research topic in deep learning. Take the classic ResNet-50 as an example: it has roughly 25 million parameters to determine. How can we compute that many coefficients? There are two approaches:
(1) The first and most direct one is brute force: enumerate the parameters exhaustively. This is practically impossible to carry out, a task on the scale of moving mountains.
(2) To make solving for the parameters faster, a second approach was proposed: backpropagation (BP) plus an optimizer, which approximate the solution iteratively.
The optimizer, then, updates the network parameters according to the gradient information produced by backpropagation, so as to lower the value of the loss function and bring the model output closer to the ground-truth labels.
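To make this concrete, here is a minimal sketch (not from the original text; the toy tensors and the learning rate are assumptions) of what a plain gradient-descent optimizer does: after backpropagation has filled in p.grad for every parameter p, the update is simply p = p - lr * p.grad, which is what optimizer.step() performs for vanilla SGD.

```python
import torch

# toy parameters for a linear model y = w * x + b (assumed for illustration)
w = torch.randn(1, requires_grad=True)
b = torch.zeros(1, requires_grad=True)
lr = 0.1  # assumed learning rate

x = torch.tensor([1.0, 2.0, 3.0])
target = torch.tensor([2.0, 4.0, 6.0])

loss = ((w * x + b - target) ** 2).mean()
loss.backward()  # backpropagation fills in w.grad and b.grad

# the update a basic SGD optimizer applies in step(): p <- p - lr * p.grad
with torch.no_grad():
    w -= lr * w.grad
    b -= lr * b.grad
w.grad.zero_()
b.grad.zero_()

# the equivalent with torch.optim:
#   optimizer = torch.optim.SGD([w, b], lr=lr)
#   optimizer.step()
#   optimizer.zero_grad()
```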
2. PyTorch Optimizers
PyTorch provides an optimizer library, torch.optim, which includes the following optimizers (a few construction examples are sketched after the list):
torch.optim.ASGD
torch.optim.Adadelta
torch.optim.Adagrad
torch.optim.Adam
torch.optim.AdamW
torch.optim.Adamax
torch.optim.LBFGS
torch.optim.RMSprop
torch.optim.Rprop
torch.optim.SGD
torch.optim.SparseAdam
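As a quick aside (not part of the original list), all of these classes are constructed the same way: pass in the parameters to optimize together with the optimizer-specific hyperparameters. A small sketch, in which the toy model net and the hyperparameter values are assumptions:

```python
import torch
import torch.nn as nn

net = nn.Linear(10, 2)  # assumed toy model

# plain SGD with momentum
opt_sgd = torch.optim.SGD(net.parameters(), lr=0.01, momentum=0.9)

# Adam with a typical learning rate
opt_adam = torch.optim.Adam(net.parameters(), lr=1e-3)

# AdamW, which decouples weight decay from the gradient update
opt_adamw = torch.optim.AdamW(net.parameters(), lr=1e-3, weight_decay=1e-2)
```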
All of the optimization algorithms above inherit from Optimizer, so let us first look at Optimizer, the base class shared by every optimizer. It is defined as follows:
```python
class Optimizer(object):
    def __init__(self, params, defaults):
        self.defaults = defaults
        self.state = defaultdict(dict)
        self.param_groups = []
```
Optimizer has three attributes:
`defaults`: stores the optimizer's hyperparameters, for example:

```python
{'lr': 0.1, 'momentum': 0.9, 'dampening': 0, 'weight_decay': 0, 'nesterov': False}
```
`state`: a cache of per-parameter state, for example:

```python
defaultdict(<class 'dict'>, {tensor([[ 0.3864, -0.0131],
        [-0.1911, -0.4511]], requires_grad=True): {'momentum_buffer': tensor([[0.0052, 0.0052],
        [0.0052, 0.0052]])}})
```
`param_groups`: the parameter groups managed by the optimizer. It is a list in which every element is a dict with the keys params, lr, momentum, dampening, weight_decay and nesterov, for example:

```python
[{'params': [tensor([[-0.1022, -1.6890],
        [-1.5116, -1.7846]], requires_grad=True)], 'lr': 1, 'momentum': 0, 'dampening': 0, 'weight_decay': 0, 'nesterov': False}]
```
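A minimal sketch (the 2x2 weight tensor and the hyperparameters are assumed, chosen to resemble the examples above) showing how these three attributes can be inspected:

```python
import torch

weight = torch.randn(2, 2, requires_grad=True)
optimizer = torch.optim.SGD([weight], lr=0.1, momentum=0.9)

print(optimizer.defaults)      # the hyperparameters, as in the 'defaults' example
print(optimizer.param_groups)  # a single group containing our one tensor

# 'state' stays empty until step() has run at least once
weight.sum().backward()
optimizer.step()
print(optimizer.state)         # now holds a momentum_buffer for the tensor
```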
Optimizer also provides the following methods:
`zero_grad()`: clears the gradients of all managed parameters. PyTorch does not clear tensor gradients automatically, it accumulates them instead, so the gradients have to be zeroed in every training iteration.
```python
def zero_grad(self, set_to_none: bool = False):
    for group in self.param_groups:
        for p in group['params']:
            if p.grad is not None:          # the gradient exists
                if set_to_none:
                    p.grad = None
                else:
                    if p.grad.grad_fn is not None:
                        p.grad.detach_()
                    else:
                        p.grad.requires_grad_(False)
                    p.grad.zero_()          # set the gradient to zero
```
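A small sketch (the scalar tensor is assumed for illustration) of why this matters: without zero_grad(), the gradients from successive backward passes add up.

```python
import torch

x = torch.tensor([2.0], requires_grad=True)
optimizer = torch.optim.SGD([x], lr=0.1)

(x * x).backward()
print(x.grad)        # tensor([4.])

(x * x).backward()   # without zeroing, the new gradient is accumulated
print(x.grad)        # tensor([8.])

optimizer.zero_grad()
print(x.grad)        # tensor([0.]) (or None when set_to_none=True)
```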
`step()`: performs a single optimization step, i.e. one parameter update.
```python
def step(self, closure):
    raise NotImplementedError
```
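Each concrete optimizer overrides step(). Most are simply called as optimizer.step() after loss.backward(), but a few, such as LBFGS, need to re-evaluate the loss several times per step and therefore require the closure argument. A minimal sketch, assuming model, criterion, data and target already exist:

```python
# assumed to exist: model, criterion, data, target
optimizer = torch.optim.LBFGS(model.parameters(), lr=0.1)

def closure():
    optimizer.zero_grad()
    output = model(data)
    loss = criterion(output, target)
    loss.backward()
    return loss

optimizer.step(closure)  # LBFGS calls closure() internally as often as it needs
```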
`add_param_group()`: adds a parameter group.
```python
def add_param_group(self, param_group):
    assert isinstance(param_group, dict), "param group must be a dict"
    # check that the params entry holds tensors
    params = param_group['params']
    if isinstance(params, torch.Tensor):
        param_group['params'] = [params]
    elif isinstance(params, set):
        raise TypeError('optimizer parameters need to be organized in ordered collections, but '
                        'the ordering of tensors in sets will change between runs. Please use a list instead.')
    else:
        param_group['params'] = list(params)

    for param in param_group['params']:
        if not isinstance(param, torch.Tensor):
            raise TypeError("optimizer can only optimize Tensors, "
                            "but one of the params is " + torch.typename(param))
        if not param.is_leaf:
            raise ValueError("can't optimize a non-leaf Tensor")

    for name, default in self.defaults.items():
        if default is required and name not in param_group:
            raise ValueError("parameter group didn't specify a value of required optimization parameter " + name)
        else:
            param_group.setdefault(name, default)

    params = param_group['params']
    if len(params) != len(set(params)):
        warnings.warn("optimizer contains a parameter group with duplicate parameters; "
                      "in future, this will cause an error; "
                      "see github.com/pytorch/pytorch/issues/40967 for more information", stacklevel=3)

    # everything above is validation that raises warnings or errors
    param_set = set()
    for group in self.param_groups:
        param_set.update(set(group['params']))

    if not param_set.isdisjoint(set(param_group['params'])):
        raise ValueError("some parameters appear in more than one parameter group")

    # append the new parameter group
    self.param_groups.append(param_group)
```
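A typical use of add_param_group() is to give part of the model its own learning rate, for example when fine-tuning a pretrained backbone with a freshly initialized head. A sketch in which both sub-modules and all hyperparameter values are assumptions:

```python
import torch
import torch.nn as nn

backbone = nn.Linear(128, 64)  # assumed pretrained part
head = nn.Linear(64, 10)       # assumed new classification head

optimizer = torch.optim.SGD(backbone.parameters(), lr=1e-4, momentum=0.9)
# the new group inherits momentum from defaults but overrides lr
optimizer.add_param_group({'params': head.parameters(), 'lr': 1e-2})

print(len(optimizer.param_groups))                # 2
print([g['lr'] for g in optimizer.param_groups])  # [0.0001, 0.01]
```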
`load_state_dict()`: loads an optimizer state dict. It can be used to resume an interrupted run from a checkpoint and continue training with the previously saved state.
```python
def load_state_dict(self, state_dict):
    r"""Loads the optimizer state.

    Arguments:
        state_dict (dict): optimizer state. Should be an object returned
            from a call to :meth:`state_dict`.
    """
    # deepcopy, to be consistent with module API
    state_dict = deepcopy(state_dict)
    # Validate the state_dict
    groups = self.param_groups
    saved_groups = state_dict['param_groups']

    if len(groups) != len(saved_groups):
        raise ValueError("loaded state dict has a different number of "
                         "parameter groups")
    param_lens = (len(g['params']) for g in groups)
    saved_lens = (len(g['params']) for g in saved_groups)
    if any(p_len != s_len for p_len, s_len in zip(param_lens, saved_lens)):
        raise ValueError("loaded state dict contains a parameter group "
                         "that doesn't match the size of optimizer's group")

    # Update the state
    id_map = {old_id: p for old_id, p in
              zip(chain.from_iterable((g['params'] for g in saved_groups)),
                  chain.from_iterable((g['params'] for g in groups)))}

    def cast(param, value):
        r"""Make a deep copy of value, casting all tensors to device of param."""
        ...

    # Copy state assigned to params (and cast tensors to appropriate types).
    # State that is not assigned to params is copied as is (needed for
    # backward compatibility).
    state = defaultdict(dict)
    for k, v in state_dict['state'].items():
        if k in id_map:
            param = id_map[k]
            state[param] = cast(param, v)
        else:
            state[k] = v

    # Update parameter groups, setting their 'params' value
    def update_group(group, new_group):
        ...

    param_groups = [
        update_group(g, ng) for g, ng in zip(groups, saved_groups)]
    self.__setstate__({'state': state, 'param_groups': param_groups})
```
`state_dict()`: returns the optimizer's current state as a dict.
```python
def state_dict(self):
    r"""Returns the state of the optimizer as a :class:`dict`.

    It contains two entries:

    * state - a dict holding current optimization state. Its content
        differs between optimizer classes.
    * param_groups - a dict containing all parameter groups
    """
    # Save order indices instead of Tensors
    param_mappings = {}
    start_index = 0

    def pack_group(group):
        ...

    param_groups = [pack_group(g) for g in self.param_groups]
    # Remap state to use order indices as keys
    packed_state = {(param_mappings[id(k)] if isinstance(k, torch.Tensor) else k): v
                    for k, v in self.state.items()}
    return {
        'state': packed_state,
        'param_groups': param_groups,
    }
```
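A sketch of how the two methods are typically combined for checkpointing; the file name is arbitrary and model and optimizer are assumed to exist:

```python
import torch

# save a checkpoint
torch.save({
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
}, 'checkpoint.pth')

# ... later, resume training from the checkpoint
checkpoint = torch.load('checkpoint.pth')
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
```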
4. Experiments
5. Training and Evaluation
Once the setup above is complete, we can load the data and start training the model. How to read data from a DataLoader was introduced earlier, right after the DataLoader was built; the training loop uses essentially the same operation, the difference being that here a for loop reads all of the data in the DataLoader.
The training procedure:
```python
# -*- coding: utf-8 -*-
"""
Created on Sun Oct 17 12:04:24 2021

@author: 86493
"""
# training mode: model parameters should support updates via backpropagation
model.train()
# validation/test mode: model parameters should not be modified
model.eval()

# iterate over all data in the DataLoader with a for loop
for data, label in train_loader:
    # move the data to the GPU for the following computation (using .cuda here)
    data, label = data.cuda(), label.cuda()
    # before training on the current batch,
    # zero the optimizer's gradients first
    optimizer.zero_grad()
    # feed the data through the model
    output = model(data)
    # compute the loss with the predefined criterion
    loss = criterion(output, label)
    # backpropagate the loss through the network
    loss.backward()
    # update the model parameters with the optimizer
    optimizer.step()
    # afterwards you can also compute metrics such as accuracy
```
The complete training function:
```python
def train(epoch):
    model.train()
    train_loss = 0
    for data, label in train_loader:
        # move the data to the GPU for the following computation
        data, label = data.cuda(), label.cuda()
        # zero the optimizer's gradients before training on the current batch
        optimizer.zero_grad()
        # feed the data through the model
        output = model(data)
        # compute the loss (prediction first, then target)
        loss = criterion(output, label)
        # backpropagate the loss through the network
        loss.backward()
        # update the model parameters with the optimizer
        optimizer.step()
        train_loss += loss.item() * data.size(0)
    train_loss = train_loss / len(train_loader.dataset)
    print('Epoch: {} \tTraining Loss: {:.6f}'.format(epoch, train_loss))
```
The validation/test flow is basically the same as training. The differences are:
- the loop has to run inside torch.no_grad(), and the model has to be switched to eval mode
- there is no need to zero the optimizer's gradients
- there is no need to backpropagate the loss
- there is no need to update the optimizer (no optimizer.step())
The complete validation function:
```python
def val(epoch):
    model.eval()
    val_loss = 0
    running_accu = 0
    with torch.no_grad():
        for data, label in val_loader:
            data, label = data.cuda(), label.cuda()
            output = model(data)
            preds = torch.argmax(output, 1)
            loss = criterion(output, label)
            val_loss += loss.item() * data.size(0)
            running_accu += torch.sum(preds == label.data)
    val_loss = val_loss / len(val_loader.dataset)
    print('Epoch: {} \tValidation Loss: {:.6f}'.format(epoch, val_loss))
```
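Putting the two functions together, a minimal driver loop (EPOCHS is an assumed constant) would look like this:

```python
EPOCHS = 20  # assumed number of training epochs

for epoch in range(1, EPOCHS + 1):
    train(epoch)
    val(epoch)
```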