Machine Learning Principles and Practice | Linear Regression and Logistic Regression


1. Basic Concepts


1.1 Learning Curves

A learning curve shows how model accuracy relates to the size of the training set: as the number of training samples grows, it traces how the model's fitting accuracy on the training set and its prediction accuracy on the cross-validation set change.

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
n_dots = 200
X = np.linspace(0, 1, n_dots)
y = np.sqrt(X) + 0.2 * np.random.rand(n_dots) - 0.1   # sqrt curve plus uniform noise in [-0.1, 0.1]
X = X.reshape(-1, 1)
y = y.reshape(-1, 1)
X.shape, y.shape
((200, 1), (200, 1))
# plot the data
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
ax.plot(X, y, color='tab:blue')

[Figure: the noisy √x training data]

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
# degree sets the order of the polynomial features
def polynomial_model(degree=1):
    polynomial_features = PolynomialFeatures(degree=degree,
                                             include_bias=False)
    linear_regression = LinearRegression()
    pipeline = Pipeline([("polynomial_features", polynomial_features),
                         ("linear_regression", linear_regression)])
    return pipeline
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit
# train_sizes controls how the number of training samples grows; np.linspace(.1, 1.0, 5) splits 10%-100% of the training set into five equal steps
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    """
    Generate a simple plot of the test and training learning curve.
    Parameters
    ----------
    estimator : object type that implements the "fit" and "predict" methods
        An object of that type which is cloned for each validation.
    title : string
        Title for the chart.
    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.
    y : array-like, shape (n_samples) or (n_samples, n_features), optional
        Target relative to X for classification or regression;
        None for unsupervised learning.
    ylim : tuple, shape (ymin, ymax), optional
        Defines minimum and maximum y values plotted.
    cv : int, cross-validation generator or an iterable, optional
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:
          - None, to use the default 3-fold cross-validation,
          - integer, to specify the number of folds.
          - An object to be used as a cross-validation generator.
          - An iterable yielding train/test splits.
        For integer/None inputs, if ``y`` is binary or multiclass,
        :class:`StratifiedKFold` is used. If the estimator is not a classifier
        or if ``y`` is neither binary nor multiclass, :class:`KFold` is used.
        Refer :ref:`User Guide <cross_validation>` for the various
        cross-validators that can be used here.
    n_jobs : integer, optional
        Number of jobs to run in parallel (default 1).
    """
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    # learning_curve automatically grows the training set according to train_sizes and scores the model at each size
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    print("train_sizes:{}, train_scores:{}, test_scores:{}\n\t".format(train_sizes.shape, train_scores.shape, test_scores.shape))
#     print("train_sizes:{}, train_scores:{}, test_scores:{}\n\t".format(train_sizes, train_scores, test_scores))
    # Compute the mean and standard deviation of the scores
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()
    # plt.fill_between would shade a one-standard-deviation band around the mean score
#     plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
#                      train_scores_mean + train_scores_std, alpha=0.1,
#                      color="r")
#     plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
#                      test_scores_mean + test_scores_std, alpha=0.1, color="g")
    # Plot the mean training and cross-validation scores at each training-set size
    plt.plot(train_sizes, train_scores_mean, 'o--', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")
    # Place the legend
    plt.legend(loc="best")
    return plt
# To smooth the learning curves, the cross-validation score is computed 10 times, each time holding out a freshly sampled 20% of the data
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
titles = ['Learning Curves (Under Fitting)',
          'Learning Curves',
          'Learning Curves (Over Fitting)']
degrees = [1, 3, 10]
plt.figure(figsize=(18, 4))
for i in range(len(degrees)):
    plt.subplot(1, 3, i + 1)
    plot_learning_curve(polynomial_model(degrees[i]), titles[i], X, y, ylim=(0.75, 1.01), cv=cv)
plt.show()
train_sizes:(5,), train_scores:(5, 10), test_scores:(5, 10)
train_sizes:(5,), train_scores:(5, 10), test_scores:(5, 10)
train_sizes:(5,), train_scores:(5, 10), test_scores:(5, 10)

[Figure: learning curves for degree 1 (underfitting), degree 3, and degree 10 (overfitting)]



1.2 L1/L2 Norms

def L1(x):
    return 1 - np.abs(x)
def L2(x):
    return np.sqrt(1 - np.power(x, 2))
def contour(v, x):
    return 5 - np.sqrt(v - np.power(x + 2, 2))    # 4x1^2 + 9x2^2 = v
def format_spines(title):
    ax = plt.gca()                                  # gca = 'get current axis'
    ax.spines['right'].set_color('none')            # hide the right and top spines
    ax.spines['top'].set_color('none')
    ax.xaxis.set_ticks_position('bottom')           # show ticks on the bottom/left only
    ax.spines['bottom'].set_position(('data',0))    # move the bottom spine to y=0
    ax.yaxis.set_ticks_position('left')
    ax.spines['left'].set_position(('data',0))      # move the left spine to x=0
    plt.title(title)
    plt.xlim(-4, 4)
    plt.ylim(-4, 4)
plt.figure(figsize=(8.4, 4), dpi=100)
x = np.linspace(-1, 1, 100)
cx = np.linspace(-3, 1, 100)
plt.subplot(1, 2, 1)
format_spines('L1 norm')
plt.plot(x, L1(x), 'r-', x, -L1(x), 'r-')
plt.plot(cx, contour(20, cx), 'r--', cx, contour(15, cx), 'r--', cx, contour(10, cx), 'r--')
plt.subplot(1, 2, 2)
format_spines('L2 norm')
plt.plot(x, L2(x), 'b-', x, -L2(x), 'b-')
plt.plot(cx, contour(19, cx), 'b--', cx, contour(15, cx), 'b--', cx, contour(10, cx), 'b--')

[Figure: L1 (diamond) and L2 (circle) norm constraint regions with cost-function contours]


Used as a regularization term, the L1 norm sparsifies the model parameters: it drives as many entries of the parameter vector to 0 as possible, keeping only the contributions of the important features.

The L2 norm as a regularization term instead keeps the parameters small but non-zero, so that every feature retains a small contribution to the prediction.


As a corollary, the L1 norm as a regularization term has the following uses:


  • Feature selection: it pushes as many entries of the parameter vector to 0 as possible, so features with little influence on the prediction can be discarded and the problem simplified. In other words, L1 regularization counters overfitting by reducing the number of features.
  • Interpretability: once the parameter vector is sparse, only the features with a real influence on the prediction remain, which makes the model's causal structure easier to explain. For example, when screening for a certain cancer with 100 features, we cannot tell which ones drive a positive result; after sparsification only a few key features are left, and the causal relationship is easy to see.


This shows that the L1 norm as a regularization term is more of an analysis tool than a practical way to fit a model, because it simply removes the unimportant features outright. In most cases, we still choose the L2 norm as the regularization term when tackling overfitting.
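
To see both behaviors directly, here is a minimal sketch (my own illustration, not part of the original notebook) that fits the same synthetic data with an L1 penalty (Lasso) and an L2 penalty (Ridge) and counts the surviving coefficients:

import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import Lasso, Ridge
# Synthetic data: only 5 of the 20 features actually carry signal
X_demo, y_demo = make_regression(n_samples=100, n_features=20, n_informative=5,
                                 noise=10, random_state=0)
lasso = Lasso(alpha=1.0).fit(X_demo, y_demo)   # L1 penalty: sparsifies
ridge = Ridge(alpha=1.0).fit(X_demo, y_demo)   # L2 penalty: only shrinks
print('Lasso non-zero coefficients:', np.count_nonzero(lasso.coef_))   # few
print('Ridge non-zero coefficients:', np.count_nonzero(ridge.coef_))   # all 20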


1.3 Regularization

  • Regularized cost function for a linear model:

$$J(\theta) = \frac{1}{2m}\left[\sum_{i=1}^{m}\left(h_\theta(x^{(i)}) - y^{(i)}\right)^2 + \lambda\sum_{j=1}^{n}\theta_j^2\right]$$


  • Regularized cost function for logistic regression:

$$J(\theta) = -\frac{1}{m}\sum_{i=1}^{m}\left[y^{(i)}\log h_\theta(x^{(i)}) + \left(1 - y^{(i)}\right)\log\left(1 - h_\theta(x^{(i)})\right)\right] + \frac{\lambda}{2m}\sum_{j=1}^{n}\theta_j^2$$

In both cases the method is the same: a regularization term is added to the original cost function.
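
As a hedged note (my own addition, not from the original article): in scikit-learn this penalty strength is exposed as a single knob. LogisticRegression takes C, the inverse of the λ above, so a smaller C means stronger regularization, while Ridge takes λ directly as alpha:

from sklearn.linear_model import LogisticRegression, Ridge
# C ~ 1/lambda: smaller C -> heavier penalty on theta
strong_reg = LogisticRegression(C=0.01, solver='liblinear')
weak_reg = LogisticRegression(C=100.0, solver='liblinear')
ridge = Ridge(alpha=1.0)   # alpha plays the role of lambda for linear models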


2. Linear Regression


2.1 Fitting a Sine Function with Linear Regression

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
n_dots = 200
# Sine curve plus some random noise
X = np.linspace(-2 * np.pi, 2 * np.pi, n_dots)
Y = np.sin(X) + 0.2 * np.random.rand(n_dots) - 0.1
X = X.reshape(-1, 1)
Y = Y.reshape(-1, 1)
X.shape, Y.shape
((200, 1), (200, 1))


Plot the noisy curve:

# plot the data
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
ax.plot(X, Y, color='tab:blue')

[Figure: noisy sine data]


Chain the two models together with a pipeline:

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
# degree determines the order of the polynomial
def polynomial_model(degree=1):
    # Polynomial feature expansion
    polynomial_features = PolynomialFeatures(degree=degree,
                                             include_bias=False)
    # normalize=True rescales the training features before fitting
    # (note: this parameter was removed in scikit-learn 1.2; in newer
    # versions add a StandardScaler step to the pipeline instead)
    linear_regression = LinearRegression(normalize=True)
    # Chain the polynomial features and the regression model
    pipeline = Pipeline([("polynomial_features", polynomial_features),
                         ("linear_regression", linear_regression)])
    return pipeline


Fit the dataset with polynomials of degree 2, 3, 5, and 10:

from sklearn.metrics import mean_squared_error
degrees = [2, 3, 5, 10]
results = []
for d in degrees:
    # Build the model: polynomial features + linear regression
    model = polynomial_model(degree=d)
    # Train
    model.fit(X, Y)
    # Score on the training data
    train_score = model.score(X, Y)
    mse = mean_squared_error(Y, model.predict(X))
    results.append({"model": model, "degree": d, "score": train_score, "mse": mse})
for r in results:
    print("degree: {}; train score: {}; mean squared error: {}".format(r["degree"], r["score"], r["mse"]))
degree: 2; train score: 0.1491389252670503; mean squared error: 0.4291288571778199
degree: 3; train score: 0.2754424056436836; mean squared error: 0.3654281311707956
degree: 5; train score: 0.8932535122798162; mean squared error: 0.05383722401155313
degree: 10; train score: 0.9936267162822405; mean squared error: 0.0032143437271831064
from matplotlib.figure import SubplotParams
# Adjust the figure size; SubplotParams sets the vertical spacing between subplots
plt.figure(figsize=(10, 5), dpi=100, subplotpars=SubplotParams(hspace=0.3))
for i, r in enumerate(results):
    fig = plt.subplot(2, 2, i+1)
    plt.xlim(-8, 8)
    plt.title("LinearRegression degree={}".format(r["degree"]))
    plt.scatter(X, Y, s=5, c='b', alpha=0.5)
    plt.plot(X, r["model"].predict(X), 'r-')

[Figure: degree 2/3/5/10 polynomial fits against the training data]


As shown below, the degree-10 polynomial fits the sine function well only within the training interval; on other intervals it performs poorly:

n_dots = 500
# x = np.linspace(-20, 20, n_dots)
x = np.linspace(-2.5 * np.pi, 2.5 * np.pi, n_dots)
x = x.reshape(-1,1)
y = results[-1]['model'].predict(x)
x.shape, y.shape
((500, 1), (500, 1))
fig = plt.subplot(1,1,1)
plt.xlim(-20, 20)
plt.plot(x,y)

[Figure: the degree-10 model evaluated beyond its training interval]


Looking at the overall picture, every model predicts well only on part of the range; the high-degree fit has clearly overfit, so reducing the polynomial degree gives better behavior over a longer interval:

n_dots = 500
# x = np.linspace(-20, 20, n_dots)
x = np.linspace(-2.5 * np.pi, 2.5 * np.pi, n_dots)
x = x.reshape(-1,1)
plt.figure(figsize=(8, 5), dpi=100, subplotpars=SubplotParams(hspace=0.3))
for i in range(4):
    y = results[i]['model'].predict(x)
    plt.subplot(2,2,i+1)
    plt.xlim(-20, 20)
    plt.plot(x,y)

[Figure: all four models evaluated beyond the training interval]


2.2 Predicting House Prices with Linear Regression

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
# note: load_boston was removed in scikit-learn 1.2;
# fetch_openml(name='boston', version=1) is one alternative there
from sklearn.datasets import load_boston
boston = load_boston()
X = boston.data
y = boston.target
X.shape, y.shape
((506, 13), (506,))


Inspect the feature names:

boston.feature_names
array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',
       'TAX', 'PTRATIO', 'B', 'LSTAT'], dtype='<U7')


Split the dataset into training and test sets at an 80/20 ratio:

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=3)
import time
from sklearn.linear_model import LinearRegression
model = LinearRegression()
# Record the start time (time.clock was removed in Python 3.8)
start = time.perf_counter()
# Fit the model on the training set
model.fit(X_train, y_train)
# Score on the training and test sets
train_score = model.score(X_train, y_train)
cv_score = model.score(X_test, y_test)
print('elapse: {0:.6f}; train_score: {1:0.6f}; cv_score: {2:.6f}'.format(time.perf_counter()-start, train_score, cv_score))
elapse: 0.001749; train_score: 0.723941; cv_score: 0.795262


Neither the training score nor the test score here is very high, which suggests underfitting: the plain linear regression model is too simple. At this point we can mine more features or add polynomial features.

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
# The polynomial degree is configurable
def polynomial_model(degree=1):
    # Polynomial feature expansion
    polynomial_features = PolynomialFeatures(degree=degree,
                                             include_bias=False)
    # normalize=True rescales the training features (removed in scikit-learn 1.2;
    # use a StandardScaler pipeline step in newer versions)
    linear_regression = LinearRegression(normalize=True)
    # 把回归模型与多项式模型串联起来
    pipeline = Pipeline([("polynomial_features", polynomial_features),
                         ("linear_regression", linear_regression)])
    return pipeline
# Fit with a degree-2 polynomial
model = polynomial_model(degree=2)
start = time.perf_counter()
model.fit(X_train, y_train)
train_score = model.score(X_train, y_train)
cv_score = model.score(X_test, y_test)
print('elapse: {0:.6f}; train_score: {1:0.6f}; cv_score: {2:.6f}'.format(time.perf_counter()-start, train_score, cv_score))
elapse: 0.735317; train_score: 0.930547; cv_score: 0.860049



Plot the learning curves:

from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit
# train_sizes controls how the number of training samples grows; np.linspace(.1, 1.0, 5) splits 10%-100% of the training set into five equal steps
def plot_learning_curve(plt, estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o--', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")
    plt.legend(loc="best")
    return plt
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
title = 'Learning Curves (degree={0})'
degrees = [1, 2, 3]
start = time.perf_counter()
plt.figure(figsize=(18, 4), dpi=200)
for i in range(len(degrees)):
    plt.subplot(1, 3, i + 1)
    plot_learning_curve(plt, polynomial_model(degrees[i]), title.format(degrees[i]), X, y, ylim=(0.01, 1.01), cv=cv)
print('elapse: {0:.6f}'.format(time.perf_counter()-start))
elapse: 6.394253

[Figure: learning curves for degrees 1, 2, and 3 on the Boston housing data]


A side-by-side look at the degree-2 model's predictions:

import pandas as pd
# Compare the first 10 predictions with the true prices
result = pd.DataFrame()
pred = model.predict(X_test)[:10]
true = y_test[:10]
result['pred'] = pred
result['true'] = true
result['error'] = pred - true
result.head(10)

[Table: first 10 predictions vs. true prices and their errors]


  • Batch Gradient Descent

Each parameter update iterates over the entire training set, so the algorithm becomes inefficient when the training set is large.


  • Stochastic Gradient Descent

The key change is dropping the summation: instead of traversing the whole training set, each update uses a single example drawn at random from it, which greatly improves training efficiency.
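
As a concrete sketch of the difference (my own illustration, not the article's code), batch gradient descent averages the gradient over all m samples per update, while SGD updates from one randomly drawn sample at a time:

import numpy as np
def batch_gd(X, y, lr=0.1, epochs=100):
    # One parameter update per full pass over all m samples
    m, n = X.shape
    theta = np.zeros(n)
    for _ in range(epochs):
        grad = X.T @ (X @ theta - y) / m      # gradient averaged over all samples
        theta -= lr * grad
    return theta
def sgd(X, y, lr=0.01, epochs=100, seed=0):
    # One parameter update per single randomly drawn sample
    rng = np.random.default_rng(seed)
    m, n = X.shape
    theta = np.zeros(n)
    for _ in range(epochs):
        for i in rng.permutation(m):          # visit samples in random order
            grad = (X[i] @ theta - y[i]) * X[i]
            theta -= lr * grad
    return theta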


Question: why is it valid to iterate on the parameters using a single randomly chosen sample?

$$J(\theta) = \frac{1}{2m}\sum_{i=1}^{m}\left(h_\theta(x^{(i)}) - y^{(i)}\right)^2$$


In the cost function above, dividing the sum by 2 is for computational convenience (the 2 cancels when differentiating), while dividing by m takes a mean: the average distance from all the training points to the prediction function. Stochastic gradient descent instead randomly selects a single example from the training set; if the updates are numerous enough and truly random, then from a probability standpoint this stream of sampled values is equivalent to the mean.


By way of analogy, suppose a piggy bank holds ten 0.1-yuan coins, two 0.5-yuan coins, and one 1-yuan coin: 13 coins worth 3 yuan in total. Draw a coin at random 1000 times, record its value each time, and put it back. The average of the 1000 recorded values (their sum divided by 1000) should be approximately equal to the average value per coin in the bank (3/13 yuan).
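
The analogy is easy to verify numerically; a minimal sketch (my own illustration):

import numpy as np
rng = np.random.default_rng(0)
# ten 0.1-yuan coins, two 0.5-yuan coins, one 1-yuan coin
coins = np.array([0.1] * 10 + [0.5] * 2 + [1.0])
draws = rng.choice(coins, size=1000)   # draw with replacement
print(draws.mean())                    # close to the true mean
print(coins.mean())                    # 3/13 ≈ 0.2308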


3. Logistic Regression


3.1 The Logistic Regression Cost Function

def f_1(x):
    return -np.log(x)
def f_0(x):
    return -np.log(1 - x)
X = np.linspace(0.01, 0.99, 100)
f = [f_1, f_0]
titles = ["y=1: $-log(h_\\theta(x))$", "y=0: $-log(1 - h_\\theta(x))$"]
plt.figure(figsize=(12, 4))
for i in range(len(f)):
    plt.subplot(1, 2, i + 1)
    plt.title(titles[i])
    plt.xlabel("$h_\\theta(x)$")
    plt.ylabel("$Cost(h_\\theta(x), y)$")
    plt.plot(X, f[i](X), 'r-')

[Figure: the two cost branches, −log(h_θ(x)) for y=1 and −log(1−h_θ(x)) for y=0]

In short, for binary classification: for a positive sample (y=1), the closer the predicted value h_θ(x) is to 1, the smaller the loss; for a negative sample (y=0), the closer h_θ(x) is to 1, the larger the loss.
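
A quick numeric check of this behavior (my own illustration):

import numpy as np
h = np.array([0.1, 0.5, 0.9])   # predicted probabilities h_theta(x)
print(-np.log(h))               # cost if y=1: [2.30, 0.69, 0.11] -- falls as h -> 1
print(-np.log(1 - h))           # cost if y=0: [0.11, 0.69, 2.30] -- rises as h -> 1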


3.2 Logistic Regression in Practice: Cancer Prediction

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
# Load the data
from sklearn.datasets import load_breast_cancer
cancer = load_breast_cancer()
X = cancer.data
y = cancer.target
print('X.shape:{0}, y.shape:{1}; no. positive: {2}; no. negative: {3}'.format(
    X.shape, y.shape, y[y==1].shape[0], y[y==0].shape[0]))
X.shape:(569, 30), y.shape:(569,); no. positive: 357; no. negative: 212
cancer.feature_names
array(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error',
       'fractal dimension error', 'worst radius', 'worst texture',
       'worst perimeter', 'worst area', 'worst smoothness',
       'worst compactness', 'worst concavity', 'worst concave points',
       'worst symmetry', 'worst fractal dimension'], dtype='<U23')


Split into training and validation sets:

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


Train a logistic regression model:

# Train the model
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='liblinear')
model.fit(X_train, y_train)
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
print('train score: {train_score:.6f}; test score: {test_score:.6f}'.format(
    train_score=train_score, test_score=test_score))
train score: 0.953846; test score: 0.964912



For classification problems, the score returned by sklearn is the fraction of correct predictions (accuracy).

# Predict on the test samples
y_pred = model.predict(X_test)
print('matchs: {0}/{1}={2}'.format(np.equal(y_pred, y_test).sum(), y_test.shape[0], np.equal(y_pred, y_test).sum()/y_test.shape[0]))
matchs: 110/114=0.9649122807017544


Find the samples predicted with less than 90% confidence:

# Predicted probabilities: count the samples predicted with less than 90% confidence
y_pred_proba = model.predict_proba(X_test)
print('sample of predict probability: {0}, y_pred_proba.shape:{1}'.format(y_pred_proba[0], y_pred_proba.shape))
# Take the first column (probability of the negative class) and keep the samples where it exceeds 0.1; store them in result
y_pred_proba_0 = y_pred_proba[:, 0] > 0.1 
result = y_pred_proba[y_pred_proba_0]
# Within result, take the second column (probability of the positive class) and keep the samples where it exceeds 0.1
y_pred_proba_1 = result[:, 1] > 0.1
print("y_pred_proba_1.shape:{}".format(y_pred_proba_1.shape))
sample of predict probability: [9.99999929e-01 7.14244099e-08], y_pred_proba.shape:(114, 2)
y_pred_proba_1.shape:(56,)
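
The same filter can be written more directly (my own illustration): with two classes, a sample's confidence is its larger class probability, so "both probabilities above 0.1" is the same as "maximum probability below 0.9":

low_confidence = np.max(y_pred_proba, axis=1) < 0.9   # boolean mask over the test set
print('low-confidence samples:', low_confidence.sum())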


Next, fit a logistic regression on degree-2 polynomial features with L1 regularization:

import time
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
# Add polynomial preprocessing
def polynomial_model(degree=1, **kwarg):
    polynomial_features = PolynomialFeatures(degree=degree,
                                             include_bias=False)
    logistic_regression = LogisticRegression(**kwarg)
    pipeline = Pipeline([("polynomial_features", polynomial_features),
                         ("logistic_regression", logistic_regression)])
    return pipeline
# Create the model with added degree-2 polynomial features
model = polynomial_model(degree=2, penalty='l1', solver='liblinear')
start = time.perf_counter()
model.fit(X_train, y_train)
train_score = model.score(X_train, y_train)
cv_score = model.score(X_test, y_test)
print('elapse: {0:.6f}; train_score: {1:0.6f}; cv_score: {2:.6f}'.format(
    time.perf_counter()-start, train_score, cv_score))
elapse: 0.570833; train_score: 1.000000; cv_score: 0.956140


With the L1 norm as the regularization term, the parameters are sparsified: the model automatically selects the features that matter. We can check how many features were kept, i.e., how many of the model parameters θ_j are non-zero:

logistic_regression = model.named_steps['logistic_regression']
print('model parameters shape: {0}; count of non-zero element: {1}'.format(
    logistic_regression.coef_.shape, 
    np.count_nonzero(logistic_regression.coef_)))
model parameters shape: (1, 495); count of non-zero element: 93



Learning curves


How do we know that using the L1 norm as the regularization term improves the algorithm's accuracy?


First, plot the learning curves for degree-1 and degree-2 polynomials with the L1 norm as the regularization term:

from utils import plot_learning_curve
from sklearn.model_selection import ShuffleSplit
# To smooth the learning curves, compute the cross-validation score 10 times
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
title = 'Learning Curves (degree={0}, penalty={1})'
degrees = [1, 2]
penalty = 'l1'
start = time.perf_counter()
plt.figure(figsize=(12, 4), dpi=144)
for i in range(len(degrees)):
    plt.subplot(1, len(degrees), i + 1)
    plot_learning_curve(plt, polynomial_model(degree=degrees[i], penalty=penalty, solver='liblinear', max_iter=300), 
                        title.format(degrees[i], penalty), X, y, ylim=(0.8, 1.01), cv=cv)
print('elapse: {0:.6f}'.format(time.perf_counter()-start))
elapse: 21.228479

[Figure: learning curves with the L1 penalty, degree 1 and degree 2]


Now plot the learning curves for degree-1 and degree-2 polynomials with the L2 norm as the regularization term:

import warnings
warnings.filterwarnings("ignore")
penalty = 'l2'
start = time.perf_counter()
plt.figure(figsize=(12, 4), dpi=144)
for i in range(len(degrees)):
    plt.subplot(1, len(degrees), i + 1)
    plot_learning_curve(plt, polynomial_model(degree=degrees[i], penalty=penalty, solver='lbfgs'), 
                        title.format(degrees[i], penalty), X, y, ylim=(0.8, 1.01), cv=cv)
print('elapse: {0:.6f}'.format(time.perf_counter()-start))
elapse: 8.343727

[Figure: learning curves with the L2 penalty, degree 1 and degree 2]


Plotting the L1 learning curves takes noticeably longer. The reason is that scikit-learn's learning_curve() function trains the model many times while building the curve and scores it against the cross-validation samples each time; in addition, to smooth the curve, each point is computed several times and averaged. That repetition is exactly what the ShuffleSplit object provides.


Try fitting the model with a degree-3 polynomial:

# Create the model with added degree-3 polynomial features
model = polynomial_model(degree=3, penalty='l1', solver='liblinear')
X_train.shape, X_test.shape, y_train.shape, y_test.shape
((455, 30), (114, 30), (455,), (114,))
model.fit(X_train, y_train)
test_score = model.score(X_test, y_test)
test_score
0.9649122807017544
logistic_regression2 = model.named_steps['logistic_regression']
print("use feature nums:{}\nall feature nums:{}".format(np.count_nonzero(logistic_regression2.coef_), logistic_regression2.coef_.shape[-1]))
use feature nums:1262
all feature nums:5455


Visualize the predictions:

import pandas as pd
result = pd.DataFrame()
result['pred'] = model.predict(X_test)
result['true'] = y_test
result['parm'] = result['pred'] == result['true']   # whether each prediction matches the label
result.head(10)

[Table: first 10 predictions vs. true labels]


