【Python机器学习】实验11 支持向量机2-阿里云开发者社区

1.4 可视化分析

#绘制图片
plt.figure(figsize=(12,4))
plt.subplot(1,2,1)
plt.scatter(data1["X1"],data1["X2"],marker="s",c=data1["SV1 decision function"],cmap='seismic')
plt.title("SVC1")
plt.subplot(1,2,2)
plt.scatter(data1["X1"],data1["X2"],marker="x",c=data1["SV2 decision function"],cmap='seismic')
plt.title("SVC2")
plt.show()

实例2 核支持向量机

现在我们将从线性SVM转移到能够使用内核进行非线性分类的SVM。我们首先负责实现一个高斯核函数。虽然scikit-learn具有内置的高斯内核，但为了实现更清楚，我们将从头开始实现。

2.1 读取数据集

data2 = pd.read_csv('data/svmdata2.csv')
data2

	X1	X2	y
0	0.107143	0.603070	1
1	0.093318	0.649854	1
2	0.097926	0.705409	1
3	0.155530	0.784357	1

4	0.210829	0.866228	1
...	...	...	...
858	0.994240	0.516667	1
859	0.964286	0.472807	1

860	0.975806	0.439474	1
861	0.989631	0.425439	1
862	0.996544	0.414912	1

863 rows × 3 columns

#可视化数据点
positive = data2[data2['y'].isin([1])]
negative = data2[data2['y'].isin([0])]
fig, ax = plt.subplots(figsize=(6,4))
ax.scatter(positive['X1'], positive['X2'], s=50, marker='x', label='Positive')
ax.scatter(negative['X1'], negative['X2'], s=50, marker='o', label='Negative')
ax.legend()
plt.show()

2.2 定义高斯核函数

def gaussian(x1,x2,sigma):
    return np.exp(-np.sum((x1-x2)**2)/(2*(sigma**2)))

x1=np.arange(1,5)
x2=np.arange(6,10)
gaussian(x1,x2,2)

3.726653172078671e-06

x1 = np.array([1.0, 2.0, 1.0])
x2 = np.array([0.0, 4.0, -1.0])
sigma = 2
gaussian(x1,x2,2)

0.32465246735834974

X2_train=data2[["X1","X2"]].values
y2_train=data2["y"].values
X2_train,y2_train

(array([[0.107143 , 0.60307  ],
        [0.093318 , 0.649854 ],
        [0.0979263, 0.705409 ],
        ...,
        [0.975806 , 0.439474 ],
        [0.989631 , 0.425439 ],
        [0.996544 , 0.414912 ]]),
 array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1], dtype=int64))

该结果与练习中的预期值相符。接下来，我们将检查另一个数据集，这次用非线性决策边界。

对于该数据集，我们将使用内置的RBF内核构建支持向量机分类器，并检查其对训练数据的准确性。为了可视化决策边界，这一次我们将根据实例具有负类标签的预测概率来对点做阴影。从结果可以看出，它们大部分是正确的。

2.3 创建非线性的支持向量机

import sklearn.svm as svm
nl_svc=svm.SVC(C=100,gamma=10,probability=True)
nl_svc.fit(X2_train,y2_train)

SVC(C=100, gamma=10, probability=True)

nl_svc.score(X2_train,y2_train)

0.9698725376593279

2.4 可视化样本类别

#将样本属于正类的概率作为颜色来对两类样本进行可视化输出
plt.figure(figsize=(12,4))
plt.subplot(1,2,1)
positive = data2[data2['y'].isin([1])]
negative = data2[data2['y'].isin([0])]
plt.scatter(positive['X1'], positive['X2'], s=50, marker='x', label='Positive')
plt.scatter(negative['X1'], negative['X2'], s=50, marker='o', label='Negative')
plt.legend()
plt.subplot(1,2,2)
data2["probability"]=nl_svc.predict_proba(data2[["X1","X2"]])[:,1]
plt.scatter(data2["X1"],data2["X2"],s=30,c=data2["probability"],cmap="Reds")
plt.show()

对于第三个数据集，我们给出了训练和验证集，并且基于验证集性能为SVM模型找到最优超参数。虽然我们可以使用scikit-learn的内置网格搜索来做到这一点，但是本着遵循练习的目的，我们将从头开始实现一个简单的网格搜索。

实例3 如何选择最优的C和gamma

3.1 读取数据

#读取文件，获取数据集
data3=pd.read_csv('data/svmdata3.csv')
#读取文件，获取验证集
data3val=pd.read_csv('data/svmdata3val.csv')

data3

	X1	X2	y
0	-0.158986	0.423977	1
1	-0.347926	0.470760	1
2	-0.504608	0.353801	1
3	-0.596774	0.114035	1
4	-0.518433	-0.172515	1

...	...	...	...
206	-0.399885	-0.621930	1
207	-0.124078	-0.126608	1
208	-0.316935	-0.228947	1
209	-0.294124	-0.134795	0
210	-0.153111	0.184503	0

211 rows × 3 columns

data3val

	X1	X2	yval	y
0	-0.353062	-0.673902	0	0
1	-0.227126	0.447320	1	1
2	0.092898	-0.753524	0	0
3	0.148243	-0.718473	0	0
4	-0.001512	0.162928	0	0
...	...	...	...	...

195	0.005203	-0.544449	1	1
196	0.176352	-0.572454	0	0
197	0.127651	-0.340938	0	0
198	0.248682	-0.497502	0	0
199	-0.316899	-0.429413	0	0

200 rows × 4 columns

X = data3[['X1','X2']].values
Xval = data3val[['X1','X2']].values
y = data3['y'].values
yval = data3val['yval'].values

3.2 利用数据集中的验证集做模型选择

C_values = [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100]
gamma_values = [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100]
best_score = 0
best_params = {'C': None, 'gamma': None}
for C in C_values:
    for gamma in gamma_values:
        svc = svm.SVC(C=C, gamma=gamma)
        svc.fit(X, y)
        score = svc.score(Xval, yval)
        if score > best_score:
            best_score = score
            best_params['C'] = C
            best_params['gamma'] = gamma
best_score, best_params

(0.965, {'C': 0.3, 'gamma': 100})

from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV
parameters = {'gamma':[0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100], 'C': [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100]}
svc = svm.SVC()
clf = GridSearchCV(svc, parameters)
clf.fit(X, y)
# sorted(clf.cv_results_.keys())
max_index=np.argmax(clf.cv_results_['mean_test_score'])

clf.cv_results_["params"][max_index]

{'C': 30, 'gamma': 3}

实例4 基于鸢尾花数据集的决策边界绘制

4.1 读取鸢尾花数据集（特征选择花萼长度和花萼宽度）

from sklearn.svm import SVC
from sklearn import datasets
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)
iris = datasets.load_iris()
X = iris["data"][:, (2, 3)]  # petal length, petal width
y = iris["target"]
setosa_or_versicolor = (y == 0) | (y == 1)
X = X[setosa_or_versicolor]
y = y[setosa_or_versicolor]
# SVM Classifier model
svm_clf = SVC(kernel="linear", C=5)
svm_clf.fit(X, y)

SVC(C=5, kernel='linear')

np.max(X[:,0])

5.1

4.2 随机绘制几条决策边界可视化

# Bad models
x0 = np.linspace(0, 5.5, 200)
pred_1 = 5 * x0 - 20
pred_2 = x0 - 1.8
pred_3 = 0.1 * x0 + 0.5

#基于随机绘制的决策边界来叠加图
plt.figure(figsize=(6,4))
plt.plot(x0, pred_1, "g--", linewidth=2)
plt.plot(x0, pred_2, "r--", linewidth=2)
plt.plot(x0, pred_3, "b--", linewidth=2)
plt.scatter(X[:,0][y==0],X[:,1][y==0],marker="s")
plt.scatter(X[:,0][y==1],X[:,1][y==1],marker="*")
plt.axis([0, 5.5, 0, 2])
plt.show()
plt.show()

4.3 随机绘制几条决策边界可视化

svm_clf.coef_[0]

array([1.29411744, 0.82352928])

svm_clf.intercept_[0]

-3.7882347112962464

svm_clf.support_vectors_

array([[1.9, 0.4],
       [3. , 1.1]])

np.max(X[:,0]),np.min(X[:,0])

(5.1, 1.0)

4.4 最大间隔决策边界可视化

def plot_svc_decision_boundary(svm_clf, xmin, xmax):
    w = svm_clf.coef_[0]
    b = svm_clf.intercept_[0]
    # At the decision boundary, w0*x0 + w1*x1 + b = 0
    # => x1 = -w0/w1 * x0 - b/w1
    x0 = np.linspace(xmin, xmax, 200)
    decision_boundary = -w[0]/w[1] * x0 - b/w[1]
    # margin = 1/np.sqrt(w[1]**2+w[0]**2)
    margin = 1/0.9
    margin = 1/w[1]
    gutter_up = decision_boundary + margin
    gutter_down = decision_boundary - margin
    svs = svm_clf.support_vectors_
    plt.scatter(svs[:, 0], svs[:, 1], s=180, facecolors='#FFAAAA')
    plt.plot(x0, decision_boundary, "k-", linewidth=2)
    plt.plot(x0, gutter_up, "k--", linewidth=2)
    plt.plot(x0, gutter_down, "k--", linewidth=2)

plt.figure(figsize=(6,4))
plot_svc_decision_boundary(svm_clf, 0, 5.5)
plt.plot(X[:, 0][y == 1], X[:, 1][y == 1], "bs")
plt.plot(X[:, 0][y == 0], X[:, 1][y == 0], "yo")
plt.xlabel("Petal length", fontsize=14)
plt.axis([0, 5.5, 0, 2])
plt.show()

实例5 特征是否应该进行标准化？

5.1 原始特征的决策边界可视化

#准备数据
Xs = np.array([[1, 50], [5, 20], [3, 80], [5, 60]]).astype(np.float64)
ys = np.array([0, 0, 1, 1])
#实例化模型
svm_clf = SVC(kernel="linear", C=100)
svm_clf.fit(Xs, ys)
#绘制图形
plt.figure(figsize=(6,4))
plt.plot(Xs[:, 0][ys == 1], Xs[:, 1][ys == 1], "bo")
plt.plot(Xs[:, 0][ys == 0], Xs[:, 1][ys == 0], "ms")
plot_svc_decision_boundary(svm_clf, 0, 6)
plt.xlabel("$x_0$", fontsize=20)
plt.ylabel("$x_1$  ", fontsize=20, rotation=0)
plt.title("Unscaled", fontsize=16)
plt.axis([0, 6, 0, 90])

(0.0, 6.0, 0.0, 90.0)

5.1 标准化特征的决策边界可视化

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(Xs)
svm_clf.fit(X_scaled, ys)
plt.plot(X_scaled[:, 0][ys == 1], X_scaled[:, 1][ys == 1], "bo")
plt.plot(X_scaled[:, 0][ys == 0], X_scaled[:, 1][ys == 0], "ms")
plot_svc_decision_boundary(svm_clf, -2, 2)
plt.xlabel("$x_0$", fontsize=20)
plt.title("Scaled", fontsize=16)
plt.axis([-2, 2, -2, 2])
plt.show()

实例6 回到鸢尾花数据集

#回到鸢尾花数据集
X = iris["data"][:, (2, 3)]  # petal length, petal width
y = iris["target"]

X_outliers = np.array([[3.4, 1.3], [3.2, 0.8]])
y_outliers = np.array([0, 0])
Xo1 = np.concatenate([X, X_outliers[:1]], axis=0)
yo1 = np.concatenate([y, y_outliers[:1]], axis=0)
Xo2 = np.concatenate([X, X_outliers[1:]], axis=0)
yo2 = np.concatenate([y, y_outliers[1:]], axis=0)
svm_clf1= SVC(kernel="linear", C=10**9)
svm_clf1.fit(Xo1, yo1)
plt.figure(figsize=(12, 4))
plt.subplot(121)
plt.plot(Xo1[:, 0][yo1 == 1], Xo1[:, 1][yo1 == 1], "bs")
plt.plot(Xo1[:, 0][yo1 == 0], Xo1[:, 1][yo1 == 0], "yo")
plt.text(0.3, 1.0, "Impossible!", fontsize=24, color="red")
plot_svc_decision_boundary(svm_clf1, 0, 5.5)
plt.xlabel("Petal length", fontsize=14)
plt.ylabel("Petal width", fontsize=14)
plt.annotate(
    "Outlier",
    xy=(X_outliers[0][0], X_outliers[0][1]),
    xytext=(2.5, 1.7),
    ha="center",
    arrowprops=dict(facecolor='black', shrink=0.1),
    fontsize=16,
)
plt.axis([0, 5.5, 0, 2])
svm_clf2 = SVC(kernel="linear", C=10**9)
svm_clf2.fit(Xo2, yo2)
plt.subplot(122)
plt.plot(Xo2[:, 0][yo2 == 1], Xo2[:, 1][yo2 == 1], "bs")
plt.plot(Xo2[:, 0][yo2 == 0], Xo2[:, 1][yo2 == 0], "yo")
plot_svc_decision_boundary(svm_clf2, 0, 5.5)
plt.xlabel("Petal length", fontsize=14)
plt.annotate(
    "Outlier",
    xy=(X_outliers[1][0], X_outliers[1][1]),
    xytext=(3.2, 0.08),
    ha="center",
    arrowprops=dict(facecolor='black', shrink=0.1),
    fontsize=16,
)
plt.axis([0, 5.5, 0, 2])
plt.show()
plt.show()

实例7 非线性可分的决策边界

7.1 做一个新的数据

from sklearn.pipeline import Pipeline

from sklearn.datasets import make_moons
X, y = make_moons(n_samples=100, noise=0.15, random_state=42)

np.min(X[:,0]),np.max(X[:,0])

(-1.2720155884887554, 2.4093807207967215)

np.min(X[:,1]),np.max(X[:,1])

(-0.6491427462708279, 1.2711135917248466)

x0s = np.linspace(2, 15, 2)
x1s = np.linspace(3,12,2)
x0, x1 = np.meshgrid(x0s, x1s)
x0s ,x1s ,x0, x1

(array([ 2., 15.]),
 array([ 3., 12.]),
 array([[ 2., 15.],
        [ 2., 15.]]),
 array([[ 3.,  3.],
        [12., 12.]]))

x1.ravel()

array([ 3.,  3., 12., 12.])

x0.ravel()

array([ 2., 15.,  2., 15.])

X = np.c_[x0.ravel(), x1.ravel()]
X.shape,X

((4, 2),
 array([[ 2.,  3.],
        [15.,  3.],
        [ 2., 12.],
        [15., 12.]]))

y_pred=np.array([[1,0],[0,1]])

np.meshgrid(x0s, x1s)

[array([[ 2., 15.],
        [ 2., 15.]]),
 array([[ 3.,  3.],
        [12., 12.]])]

X = np.c_[x0.ravel(), x1.ravel()]
X.shape,x0.shape

((4, 2), (2, 2))

x0

array([[ 2., 15.],
       [ 2., 15.]])

【Python机器学习】实验11 支持向量机2