简单地说,线性回归预测是基于某个变量 X (自变量)来预测变量 Y (因变量)的值,当然前提是 X 和 Y 之间存在线性关系。这两个变量之间的线性关系可以用直线表示(称为回归线)。
我们收集一系列的真实数据,例如多栋房屋的真实价格和对应的面积、房龄。我们希望在这个数据集上面来拟合模型参数使模型的预测价格与真实价格的误差达到最小。在ML术语中,数据集被称为训练集(training set),一栋房屋被称为一个样本(sample),其真实售出价格叫作标签(label),用来预测标签的两个因素叫作特征(feature)。
在模型训练中,我们需要计算价格预测值与真实值之间的误差。一个常用的选择是平方函数。它在评估索引为 $i$ 的样本误差的表达式为:
$$l^{(i)}(\mathbf{w}, b) = \frac{1}{2}\left(\hat{y}^{(i)} - y^{(i)}\right)^2$$
优化函数 - 随机梯度下降
当模型和损失函数形式较为简单时,误差最小化问题的解可以直接用公式表达出来,这类解叫作解析解(analytical solution)。本节使用的线性回归和平方误差刚好属于这个范畴。还有一类模型并没有解析解,只能通过优化算法有限次迭代来尽可能降低损失函数的值。这类解叫作数值解(numerical solution)。
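求数值解的优化算法中,小批量随机梯度下降(mini-batch stochastic gradient descent)在深度学习中被广泛使用。先初始化模型参数的初始值;然后对参数进行多次迭代,使每次迭代都降低损失函数的值。在每次迭代中,先随机均匀采样一个由固定数目训练数据样本所组成的小批量(mini-batch),然后求小批量中数据样本的平均损失关于模型参数的导数(梯度),最后用此结果与预先设定的一个正数的乘积作为模型参数在本次迭代的减小量:
$$(\mathbf{w}, b) \leftarrow (\mathbf{w}, b) - \frac{\eta}{|\mathcal{B}|} \sum_{i \in \mathcal{B}} \partial_{(\mathbf{w}, b)} l^{(i)}(\mathbf{w}, b)$$
其中 $\eta$ 是学习率,$|\mathcal{B}|$ 是小批量中的样本数。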
学习率:$\eta$ 代表在每次优化中,能够学习的步长的大小。
批量大小:$|\mathcal{B}|$ 是小批量计算中的批量大小(batch size)。
# Import packages and modules.
# %matplotlib inline  -- IPython magic; enable it when running inside a notebook.
import random

import numpy as np
import torch
from IPython import display
from matplotlib import pyplot as plt

print(torch.__version__)
# Output: 1.3.0
# Number of input features per example.
num_inputs = 2
# Number of synthetic examples to generate.
num_examples = 1000
# Ground-truth weight and bias used to synthesize the labels.
true_w = [2, -3.4]
true_b = 4.2

# Features are drawn from a standard normal distribution.
features = torch.randn(num_examples, num_inputs, dtype=torch.float32)
# labels = true_w[0] * x0 + true_w[1] * x1 + true_b, plus Gaussian noise (std 0.01).
labels = true_w[0] * features[:, 0] + true_w[1] * features[:, 1] + true_b
labels += torch.tensor(
    np.random.normal(0, 0.01, size=labels.size()), dtype=torch.float32
)
# Scatter the second feature column against the labels with marker size 1.
# The trailing semicolon is kept from the original (presumably to suppress the
# notebook echo of the returned artist).
plt.scatter(x=features[:, 1].numpy(), y=labels.numpy(), s=1);
# Inspect the features generated above.  Do NOT re-sample `features` here:
# `labels` were computed from the existing tensor, so drawing fresh random
# features would break the feature/label correspondence and leave the model
# with nothing to learn.
print(features)
# Sample output (values differ from run to run):
# tensor([[ 0.0908, -0.8646],
#         [-1.6370,  1.6305],
#         [-0.1965,  0.8613],
#         ...,
#         [-0.9776,  0.0575],
#         [ 1.9371, -0.1497],
#         [-0.1417, -1.0046]])
def data_iter(batch_size, features, labels):
    """Yield shuffled mini-batches of ``(features, labels)``.

    Args:
        batch_size: Maximum number of examples per mini-batch.
        features: Tensor whose first dimension indexes the examples.
        labels: Tensor whose first dimension indexes the same examples.

    Yields:
        ``(X, y)`` pairs selected along dimension 0 with the same random
        indices.  The final batch is shorter when the number of examples is
        not a multiple of ``batch_size``.
    """
    num_examples = len(features)
    indices = list(range(num_examples))
    random.shuffle(indices)  # visit the examples in a random order
    for start in range(0, num_examples, batch_size):
        # Slicing clamps at the end of the list, so the last batch is simply
        # shorter; no explicit min() is needed.
        batch = torch.tensor(indices[start:start + batch_size], dtype=torch.long)
        yield features.index_select(0, batch), labels.index_select(0, batch)


batch_size = 10

# Peek at the first mini-batch only.
for X, y in data_iter(batch_size, features, labels):
    print(X, '\n', y)
    break
# Sample output:
# tensor([[ 1.3591,  0.6950],
#         [ 0.5206, -0.2726],
#         [-0.6639,  0.9716],
#         [ 2.7164, -0.6513],
#         [-1.0642,  1.9331],
#         [-2.2240, -0.3616],
#         [-0.9094,  0.6691],
#         [-0.2991,  0.2488],
#         [ 1.8312,  0.2209],
#         [ 0.2833, -1.1672]])
#  tensor([6.9694, 6.0005, 9.5797, 0.6944, 4.1964, 6.8519, 2.5178, 4.4217, 5.4679,
#         9.9754])
# Initialise the weights with small Gaussian noise (std 0.01) and the bias with zero.
w = torch.tensor(np.random.normal(0, 0.01, (num_inputs, 1)), dtype=torch.float32)
b = torch.zeros(1, dtype=torch.float32)

# Track gradients so that backward() populates w.grad and b.grad.
w.requires_grad_(requires_grad=True)
b.requires_grad_(requires_grad=True)
def linreg(X, w, b):
    """Linear regression model: the matrix product of ``X`` and ``w``, plus ``b``."""
    projection = X.mm(w)
    return projection + b
def squared_loss(y_hat, y):
    """Return the element-wise halved squared error, shaped like ``y_hat``."""
    # View the targets with y_hat's shape so the subtraction is element-wise.
    target = y.view(y_hat.size())
    diff = y_hat - target
    return diff ** 2 / 2
def sgd(params, lr, batch_size):
    """Apply one mini-batch stochastic gradient descent step in place.

    Each parameter is updated as ``param -= lr * param.grad / batch_size``.
    Dividing by ``batch_size`` turns the gradient of a summed batch loss into
    the gradient of the mean loss.

    Args:
        params: Iterable of tensors whose ``.grad`` has been populated.
        lr: Learning rate (step size).
        batch_size: Number of examples the accumulated gradient was summed over.
    """
    # no_grad() keeps the in-place update out of the autograd graph without
    # going through the legacy `.data` attribute.
    with torch.no_grad():
        for param in params:
            if param.grad is None:
                # Nothing was back-propagated to this parameter; leave it as is.
                continue
            param -= lr * param.grad / batch_size
# Hyperparameters.
lr = 0.03
num_epochs = 5

net = linreg
loss = squared_loss

# Training: every epoch makes one full pass over the data set.
for epoch in range(num_epochs):
    # X holds the features and y the labels of one mini-batch.
    for X, y in data_iter(batch_size, features, labels):
        # Summed batch loss; sgd() divides the gradient by batch_size.
        l = loss(net(X, w, b), y).sum()
        # Back-propagate to fill w.grad and b.grad.
        l.backward()
        # Update the parameters with mini-batch stochastic gradient descent.
        sgd([w, b], lr, batch_size)
        # Gradients accumulate across backward() calls, so reset them.
        w.grad.data.zero_()
        b.grad.data.zero_()
    # Evaluation only: no autograd graph is needed for the epoch summary.
    with torch.no_grad():
        train_l = loss(net(features, w, b), labels)
    print('epoch %d, loss %f' % (epoch + 1, train_l.mean().item()))
# Recorded output:
# epoch 1, loss 7.605014
# epoch 2, loss 7.521966
# epoch 3, loss 7.550967
# epoch 4, loss 7.542496
# epoch 5, loss 7.535208
# NOTE(review): these recorded losses plateau near 7.5 instead of approaching
# zero -- presumably `features` had been re-sampled after `labels` were
# generated when this output was captured; verify by re-running with matching
# features and labels.
import numpy as np
import torch
from torch import nn

# Seed PyTorch's random number generator (NumPy's generator is not seeded here).
torch.manual_seed(1)
# Use float32 as the default floating-point dtype.  torch.set_default_dtype
# replaces the deprecated torch.set_default_tensor_type('torch.FloatTensor').
torch.set_default_dtype(torch.float32)
# This section restarts from fresh imports, so define the feature count here
# instead of relying on a value left over from the from-scratch section.
num_inputs = 2
num_examples = 1000
# Ground-truth weight and bias used to synthesize the labels.
true_w = [2, -3.4]
true_b = 4.2

# Standard-normal features drawn with NumPy and converted to a float tensor.
features = torch.tensor(
    np.random.normal(0, 1, (num_examples, num_inputs)), dtype=torch.float
)
# labels = true_w[0] * x0 + true_w[1] * x1 + true_b, plus Gaussian noise (std 0.01).
labels = true_w[0] * features[:, 0] + true_w[1] * features[:, 1] + true_b
labels += torch.tensor(
    np.random.normal(0, 0.01, size=labels.size()), dtype=torch.float
)
import torch.utils.data as Data

batch_size = 10

# Pair every feature row with its label.
dataset = Data.TensorDataset(features, labels)

# Wrap the data set in a DataLoader.
# NOTE: this rebinds the name `data_iter`, which earlier in the file referred
# to the hand-written generator function.
data_iter = Data.DataLoader(
    dataset=dataset,        # torch TensorDataset
    batch_size=batch_size,  # mini-batch size
    shuffle=True,           # reshuffle the data every epoch
    num_workers=2,          # load batches in 2 worker subprocesses (not threads)
)

# Peek at the first mini-batch only.
for X, y in data_iter:
    print(X, '\n', y)
    break
# Sample output:
# tensor([[ 0.5584, -0.4995],
#         [-0.1495, -1.6520],
#         [-0.3280,  0.2594],
#         [-0.4857, -1.2976],
#         [ 1.8603,  0.4539],
#         [-0.3628,  0.0064],
#         [ 1.3235, -0.3536],
#         [-2.3426, -0.5968],
#         [-0.6290, -0.2948],
#         [-0.0787,  0.2180]])
#  tensor([7.0088, 9.5071, 2.6718, 7.6535, 6.3802, 3.4601, 8.0475, 1.5223, 3.9682,
#         3.2977])
class LinearNet(nn.Module):
    """Single-layer linear model mapping ``n_feature`` inputs to one output."""

    def __init__(self, n_feature):
        """Create the model with one ``nn.Linear(n_feature, 1)`` layer."""
        super().__init__()  # initialise nn.Module before registering layers
        # Prototype: torch.nn.Linear(in_features, out_features, bias=True)
        self.linear = nn.Linear(n_feature, 1)

    def forward(self, x):
        """Apply the linear layer to ``x`` and return the result."""
        return self.linear(x)


net = LinearNet(num_inputs)

# Alternative ways to build a (possibly multi-layer) network with
# nn.Sequential.  Each assignment rebinds `net`, so the last one wins.

# Method one: pass the layers positionally.
net = nn.Sequential(
    nn.Linear(num_inputs, 1),
    # other layers can be added here
)

# Method two: start empty and register named layers one at a time.
net = nn.Sequential()
net.add_module('linear', nn.Linear(num_inputs, 1))
# net.add_module ......

# Method three: pass an OrderedDict of named layers.
from collections import OrderedDict

net = nn.Sequential(OrderedDict([
    ('linear', nn.Linear(num_inputs, 1)),
    # ......
]))
from torch.nn import init

# net[0] is the first module of the Sequential container.
# Draw its weights from N(0, 0.01^2) and set its bias to zero.
init.normal_(net[0].weight, mean=0.0, std=0.01)
init.constant_(net[0].bias, val=0.0)  # or modify directly: net[0].bias.data.fill_(0)
# Output:
# Parameter containing:
# tensor([0.], requires_grad=True)

# Show every learnable parameter of the network.
for param in net.parameters():
    print(param)
# Output:
# Parameter containing:
# tensor([[-0.0142, -0.0161]], requires_grad=True)
# Parameter containing:
# tensor([0.], requires_grad=True)
# Built-in mean squared error loss from torch.nn.
# Prototype: torch.nn.MSELoss(size_average=None, reduce=None, reduction='mean')
loss = nn.MSELoss(reduction='mean')
import torch.optim as optim

# Built-in stochastic gradient descent over all parameters of `net`.
# Prototype: torch.optim.SGD(params, lr=<required parameter>, momentum=0,
#                            dampening=0, weight_decay=0, nesterov=False)
optimizer = optim.SGD(net.parameters(), lr=0.03)
print(optimizer)
# Output:
# SGD (
# Parameter Group 0
#     dampening: 0
#     lr: 0.03
#     momentum: 0
#     nesterov: False
#     weight_decay: 0
# )
num_epochs = 3

for epoch in range(1, num_epochs + 1):
    for X, y in data_iter:
        output = net(X)
        # View y as a column so it is compared element-wise with `output`
        # (assumes net returns shape (batch, 1) -- confirm against its definition).
        l = loss(output, y.view(-1, 1))
        optimizer.zero_grad()  # reset gradients; equivalent to net.zero_grad()
        l.backward()
        optimizer.step()
    # `l` is the loss of the last mini-batch of this epoch.
    print('epoch %d, loss: %f' % (epoch, l.item()))

# Compare the learned parameters with the ground truth.
dense = net[0]
print(true_w, dense.weight.data)
print(true_b, dense.bias.data)
# Recorded output:
# epoch 1, loss: 0.000103
# epoch 2, loss: 0.000097
# epoch 3, loss: 0.000079