1.4 可视化分析
# Draw the two classifiers' decision-function values side by side,
# one scatter panel per classifier.
plt.figure(figsize=(12, 4))
panel_specs = (
    ("SVC1", "s", "SV1 decision function"),
    ("SVC2", "x", "SV2 decision function"),
)
for panel_no, (panel_title, panel_marker, score_column) in enumerate(panel_specs, start=1):
    plt.subplot(1, 2, panel_no)
    plt.scatter(
        data1["X1"],
        data1["X2"],
        marker=panel_marker,
        c=data1[score_column],
        cmap='seismic',
    )
    plt.title(panel_title)
plt.show()
实例2 核支持向量机
现在我们将从线性SVM转移到能够使用内核进行非线性分类的SVM。 我们首先要实现一个高斯核函数。 虽然scikit-learn具有内置的高斯内核,但为了更清楚地理解其原理,我们将从头开始实现。
2.1 读取数据集
data2 = pd.read_csv('data/svmdata2.csv') data2
X1 | X2 | y | |
0 | 0.107143 | 0.603070 | 1 |
1 | 0.093318 | 0.649854 | 1 |
2 | 0.097926 | 0.705409 | 1 |
3 | 0.155530 | 0.784357 | 1 |
4 | 0.210829 | 0.866228 | 1 |
... | ... | ... | ... |
858 | 0.994240 | 0.516667 | 1 |
859 | 0.964286 | 0.472807 | 1 |
860 | 0.975806 | 0.439474 | 1 |
861 | 0.989631 | 0.425439 | 1 |
862 | 0.996544 | 0.414912 | 1 |
863 rows × 3 columns
# Visualise the data points: one scatter series per class label.
positive = data2[data2['y'].isin([1])]
negative = data2[data2['y'].isin([0])]

fig, ax = plt.subplots(figsize=(6, 4))
for subset, marker_style, legend_text in (
    (positive, 'x', 'Positive'),
    (negative, 'o', 'Negative'),
):
    ax.scatter(subset['X1'], subset['X2'], s=50, marker=marker_style, label=legend_text)
ax.legend()
plt.show()
2.2 定义高斯核函数
def gaussian(x1, x2, sigma):
    """Return the Gaussian (RBF) kernel value between two vectors.

    Computes ``exp(-sum((x1 - x2)**2) / (2 * sigma**2))``.

    Args:
        x1: First vector (any array-like of real numbers).
        x2: Second vector, broadcast-compatible with ``x1``.
        sigma: Kernel bandwidth; the expression divides by ``2 * sigma**2``.

    Returns:
        The kernel value as a NumPy floating-point scalar.
    """
    # Convert to float arrays first: plain lists/tuples do not support
    # element-wise subtraction, and unsigned integer arrays would wrap
    # around on a negative difference.
    diff = np.asarray(x1, dtype=float) - np.asarray(x2, dtype=float)
    return np.exp(-np.sum(diff ** 2) / (2 * (sigma ** 2)))
x1=np.arange(1,5) x2=np.arange(6,10) gaussian(x1,x2,2)
3.726653172078671e-06
# Second sanity check of the kernel on a small float example.
x1 = np.array([1.0, 2.0, 1.0])
x2 = np.array([0.0, 4.0, -1.0])
sigma = 2
# Pass the bandwidth variable defined above instead of repeating the literal.
gaussian(x1, x2, sigma)
0.32465246735834974
# Split data2 into a feature matrix (columns X1, X2) and a label vector (column y).
feature_frame = data2[["X1", "X2"]]
label_series = data2["y"]
X2_train, y2_train = feature_frame.values, label_series.values
X2_train, y2_train
(array([[0.107143 , 0.60307 ], [0.093318 , 0.649854 ], [0.0979263, 0.705409 ], ..., [0.975806 , 0.439474 ], [0.989631 , 0.425439 ], [0.996544 , 0.414912 ]]), array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int64))
该结果与练习中的预期值相符。 接下来,我们将检查另一个数据集,这次用非线性决策边界。
对于该数据集,我们将使用内置的RBF内核构建支持向量机分类器,并检查其对训练数据的准确性。 为了可视化决策边界,这一次我们将根据实例属于正类的预测概率来对点做阴影。 从结果可以看出,它们大部分是正确的。
2.3 创建非线性的支持向量机
import sklearn.svm as svm

# Hyper-parameters for the non-linear classifier; probability=True is
# needed so that predict_proba can be called on the fitted model.
svc_options = {"C": 100, "gamma": 10, "probability": True}
nl_svc = svm.SVC(**svc_options)
nl_svc.fit(X2_train, y2_train)
SVC(C=100, gamma=10, probability=True)
nl_svc.score(X2_train,y2_train)
0.9698725376593279
2.4 可视化样本类别
# Left panel: samples drawn by their true class.
# Right panel: samples shaded by the predicted probability taken from
# column 1 of predict_proba (the positive class for 0/1 labels).
plt.figure(figsize=(12, 4))

plt.subplot(1, 2, 1)
positive = data2[data2['y'].isin([1])]
negative = data2[data2['y'].isin([0])]
plt.scatter(positive['X1'], positive['X2'], s=50, marker='x', label='Positive')
plt.scatter(negative['X1'], negative['X2'], s=50, marker='o', label='Negative')
plt.legend()

plt.subplot(1, 2, 2)
# nl_svc was fitted on a plain ndarray, so hand it an ndarray here as well;
# passing the DataFrame makes scikit-learn warn about unexpected feature names.
data2["probability"] = nl_svc.predict_proba(data2[["X1", "X2"]].values)[:, 1]
plt.scatter(data2["X1"], data2["X2"], s=30, c=data2["probability"], cmap="Reds")
plt.show()
对于第三个数据集,我们给出了训练和验证集,并且基于验证集性能为SVM模型找到最优超参数。 虽然我们可以使用scikit-learn的内置网格搜索来做到这一点,但是本着遵循练习的目的,我们将从头开始实现一个简单的网格搜索。
实例3 如何选择最优的C和gamma
3.1 读取数据
# Load the training set (data3) and the validation set (data3val) from CSV.
data3, data3val = map(
    pd.read_csv,
    ('data/svmdata3.csv', 'data/svmdata3val.csv'),
)
data3
X1 |
X2 | y | |
0 | -0.158986 | 0.423977 | 1 |
1 | -0.347926 | 0.470760 | 1 |
2 | -0.504608 | 0.353801 | 1 |
3 | -0.596774 | 0.114035 | 1 |
4 | -0.518433 | -0.172515 | 1 |
... | ... | ... | ... |
206 | -0.399885 | -0.621930 | 1 |
207 | -0.124078 | -0.126608 | 1 |
208 | -0.316935 | -0.228947 | 1 |
209 | -0.294124 | -0.134795 | 0 |
210 | -0.153111 | 0.184503 | 0 |
211 rows × 3 columns
data3val
X1 |
X2 | yval | y | |
0 | -0.353062 | -0.673902 | 0 | 0 |
1 | -0.227126 | 0.447320 | 1 | 1 |
2 | 0.092898 | -0.753524 | 0 | 0 |
3 | 0.148243 | -0.718473 | 0 | 0 |
4 | -0.001512 | 0.162928 | 0 | 0 |
... | ... | ... | ... | ... |
195 | 0.005203 | -0.544449 | 1 | 1 |
196 | 0.176352 | -0.572454 | 0 | 0 |
197 | 0.127651 | -0.340938 | 0 | 0 |
198 | 0.248682 | -0.497502 | 0 | 0 |
199 | -0.316899 | -0.429413 | 0 | 0 |
200 rows × 4 columns
# Feature matrices come from the X1/X2 columns of each frame; the training
# labels are column 'y' and the validation labels are column 'yval'.
feature_cols = ['X1', 'X2']
X, Xval = (frame[feature_cols].values for frame in (data3, data3val))
y, yval = data3['y'].values, data3val['yval'].values
3.2 利用数据集中的验证集做模型选择
# Hand-rolled grid search: fit on (X, y), score on (Xval, yval), and keep the
# first (C, gamma) pair that strictly beats the best score seen so far.
C_values = [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100]
gamma_values = [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100]

best_score = 0
best_params = {'C': None, 'gamma': None}

for C in C_values:
    for gamma in gamma_values:
        svc = svm.SVC(C=C, gamma=gamma)
        svc.fit(X, y)
        score = svc.score(Xval, yval)
        if score <= best_score:
            continue  # not an improvement; ties keep the earlier pair
        best_score = score
        best_params.update(C=C, gamma=gamma)

best_score, best_params
(0.965, {'C': 0.3, 'gamma': 100})
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV

# Cross-validated grid search over the same candidate values for C and gamma.
candidate_values = [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100]
parameters = {'gamma': list(candidate_values), 'C': list(candidate_values)}

svc = svm.SVC()
clf = GridSearchCV(svc, parameters)
clf.fit(X, y)

# Look up the parameter combination with the highest mean test score.
mean_scores = clf.cv_results_['mean_test_score']
max_index = np.argmax(mean_scores)
clf.cv_results_["params"][max_index]
{'C': 30, 'gamma': 3}
实例4 基于鸢尾花数据集的决策边界绘制
4.1 读取鸢尾花数据集(特征选择花瓣长度和花瓣宽度)
from sklearn.svm import SVC
from sklearn import datasets
import matplotlib as mpl
import matplotlib.pyplot as plt

# Global label sizes for axes and tick labels.
for rc_group, rc_labelsize in (('axes', 14), ('xtick', 12), ('ytick', 12)):
    mpl.rc(rc_group, labelsize=rc_labelsize)

iris = datasets.load_iris()
X = iris["data"][:, (2, 3)]  # petal length, petal width
y = iris["target"]

# Keep only the samples whose target is 0 or 1.
setosa_or_versicolor = (y == 0) | (y == 1)
X, y = X[setosa_or_versicolor], y[setosa_or_versicolor]

# SVM classifier model
svm_clf = SVC(kernel="linear", C=5)
svm_clf.fit(X, y)
SVC(C=5, kernel='linear')
np.max(X[:,0])
5.1
4.2 随机绘制几条决策边界可视化
# Bad models: three hand-picked straight lines, each given as
# (slope, intercept) and evaluated on a common grid of 200 x-values.
x0 = np.linspace(0, 5.5, 200)
pred_1, pred_2, pred_3 = (
    slope * x0 + intercept
    for slope, intercept in ((5, -20), (1, -1.8), (0.1, 0.5))
)
# Overlay the three hand-picked candidate boundaries on the two classes.
plt.figure(figsize=(6, 4))
plt.plot(x0, pred_1, "g--", linewidth=2)
plt.plot(x0, pred_2, "r--", linewidth=2)
plt.plot(x0, pred_3, "b--", linewidth=2)
plt.scatter(X[:, 0][y == 0], X[:, 1][y == 0], marker="s")
plt.scatter(X[:, 0][y == 1], X[:, 1][y == 1], marker="*")
plt.axis([0, 5.5, 0, 2])
# A single show() is enough; the original called it twice in a row.
plt.show()
4.3 查看模型参数与支持向量
svm_clf.coef_[0]
array([1.29411744, 0.82352928])
svm_clf.intercept_[0]
-3.7882347112962464
svm_clf.support_vectors_
array([[1.9, 0.4], [3. , 1.1]])
np.max(X[:,0]),np.min(X[:,0])
(5.1, 1.0)
4.4 最大间隔决策边界可视化
def plot_svc_decision_boundary(svm_clf, xmin, xmax):
    """Plot a fitted linear SVC's boundary, gutters and support vectors.

    Draws onto the current matplotlib axes: the support vectors as large
    filled markers, the decision boundary as a solid black line, and the two
    margin gutters as dashed black lines.

    Args:
        svm_clf: Fitted classifier exposing ``coef_``, ``intercept_`` and
            ``support_vectors_``. Only ``coef_[0]`` and ``intercept_[0]``
            are used, and the second weight ``coef_[0][1]`` must be non-zero
            because the line equations divide by it.
        xmin: Left end of the x-range over which the lines are drawn.
        xmax: Right end of the x-range over which the lines are drawn.
    """
    w = svm_clf.coef_[0]
    b = svm_clf.intercept_[0]

    # At the decision boundary, w0*x0 + w1*x1 + b = 0
    # => x1 = -w0/w1 * x0 - b/w1
    x0 = np.linspace(xmin, xmax, 200)
    decision_boundary = -w[0] / w[1] * x0 - b / w[1]

    # The gutters are the lines w0*x0 + w1*x1 + b = +/-1, i.e. the boundary
    # shifted by 1/w1 along the x1 axis.
    margin = 1 / w[1]
    gutter_up = decision_boundary + margin
    gutter_down = decision_boundary - margin

    svs = svm_clf.support_vectors_
    plt.scatter(svs[:, 0], svs[:, 1], s=180, facecolors='#FFAAAA')
    plt.plot(x0, decision_boundary, "k-", linewidth=2)
    plt.plot(x0, gutter_up, "k--", linewidth=2)
    plt.plot(x0, gutter_down, "k--", linewidth=2)
# Draw the fitted boundary first, then the samples of each class on top.
plt.figure(figsize=(6, 4))
plot_svc_decision_boundary(svm_clf, 0, 5.5)
for class_label, point_style in ((1, "bs"), (0, "yo")):
    in_class = y == class_label
    plt.plot(X[:, 0][in_class], X[:, 1][in_class], point_style)
plt.xlabel("Petal length", fontsize=14)
plt.axis([0, 5.5, 0, 2])
plt.show()
实例5 特征是否应该进行标准化?
5.1 原始特征的决策边界可视化
# Prepare a four-point toy data set.
Xs = np.array([[1, 50], [5, 20], [3, 80], [5, 60]], dtype=np.float64)
ys = np.array([0, 0, 1, 1])

# Instantiate and fit the model on the raw (unscaled) features.
svm_clf = SVC(kernel="linear", C=100)
svm_clf.fit(Xs, ys)

# Draw the samples per class, then the fitted boundary.
plt.figure(figsize=(6, 4))
for class_label, point_style in ((1, "bo"), (0, "ms")):
    in_class = ys == class_label
    plt.plot(Xs[:, 0][in_class], Xs[:, 1][in_class], point_style)
plot_svc_decision_boundary(svm_clf, 0, 6)
plt.xlabel("$x_0$", fontsize=20)
plt.ylabel("$x_1$ ", fontsize=20, rotation=0)
plt.title("Unscaled", fontsize=16)
plt.axis([0, 6, 0, 90])
(0.0, 6.0, 0.0, 90.0)
5.2 标准化特征的决策边界可视化
from sklearn.preprocessing import StandardScaler

# Standardise the toy features and refit the same classifier object on them.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(Xs)
svm_clf.fit(X_scaled, ys)

for class_label, point_style in ((1, "bo"), (0, "ms")):
    in_class = ys == class_label
    plt.plot(X_scaled[:, 0][in_class], X_scaled[:, 1][in_class], point_style)
plot_svc_decision_boundary(svm_clf, -2, 2)
plt.xlabel("$x_0$", fontsize=20)
plt.title("Scaled", fontsize=16)
plt.axis([-2, 2, -2, 2])
plt.show()
实例6 回到鸢尾花数据集
# Back to the iris data set.
X = iris["data"][:, (2, 3)]  # petal length, petal width
y = iris["target"]

# The outlier experiment that follows plots only labels 0 and 1 and reads
# coef_[0] / intercept_[0] as a single binary boundary, so keep just those two
# classes. Otherwise the third class is trained on as well and its support
# vectors get highlighted in the figure.
# NOTE(review): assumes the three-class reload was unintentional; confirm.
two_class_mask = (y == 0) | (y == 1)
X = X[two_class_mask]
y = y[two_class_mask]
# Append one extra class-0 point at a time and refit a linear SVC with a very
# large C (10**9) to show how a single outlier changes the fitted boundary.
X_outliers = np.array([[3.4, 1.3], [3.2, 0.8]])
y_outliers = np.array([0, 0])
Xo1 = np.concatenate([X, X_outliers[:1]], axis=0)
yo1 = np.concatenate([y, y_outliers[:1]], axis=0)
Xo2 = np.concatenate([X, X_outliers[1:]], axis=0)
yo2 = np.concatenate([y, y_outliers[1:]], axis=0)

svm_clf1 = SVC(kernel="linear", C=10**9)
svm_clf1.fit(Xo1, yo1)

plt.figure(figsize=(12, 4))

# Left panel: data plus the first outlier.
plt.subplot(121)
plt.plot(Xo1[:, 0][yo1 == 1], Xo1[:, 1][yo1 == 1], "bs")
plt.plot(Xo1[:, 0][yo1 == 0], Xo1[:, 1][yo1 == 0], "yo")
plt.text(0.3, 1.0, "Impossible!", fontsize=24, color="red")
plot_svc_decision_boundary(svm_clf1, 0, 5.5)
plt.xlabel("Petal length", fontsize=14)
plt.ylabel("Petal width", fontsize=14)
plt.annotate(
    "Outlier",
    xy=(X_outliers[0][0], X_outliers[0][1]),
    xytext=(2.5, 1.7),
    ha="center",
    arrowprops=dict(facecolor='black', shrink=0.1),
    fontsize=16,
)
plt.axis([0, 5.5, 0, 2])

svm_clf2 = SVC(kernel="linear", C=10**9)
svm_clf2.fit(Xo2, yo2)

# Right panel: data plus the second outlier.
plt.subplot(122)
plt.plot(Xo2[:, 0][yo2 == 1], Xo2[:, 1][yo2 == 1], "bs")
plt.plot(Xo2[:, 0][yo2 == 0], Xo2[:, 1][yo2 == 0], "yo")
plot_svc_decision_boundary(svm_clf2, 0, 5.5)
plt.xlabel("Petal length", fontsize=14)
plt.annotate(
    "Outlier",
    xy=(X_outliers[1][0], X_outliers[1][1]),
    xytext=(3.2, 0.08),
    ha="center",
    arrowprops=dict(facecolor='black', shrink=0.1),
    fontsize=16,
)
plt.axis([0, 5.5, 0, 2])

# A single show() is enough; the original called it twice in a row.
plt.show()
实例7 非线性可分的决策边界
7.1 做一个新的数据
from sklearn.pipeline import Pipeline
from sklearn.datasets import make_moons X, y = make_moons(n_samples=100, noise=0.15, random_state=42)
np.min(X[:,0]),np.max(X[:,0])
(-1.2720155884887554, 2.4093807207967215)
np.min(X[:,1]),np.max(X[:,1])
(-0.6491427462708279, 1.2711135917248466)
# Tiny meshgrid demo: two sample points per axis expand into 2x2 coordinate grids.
x0s, x1s = np.linspace(2, 15, 2), np.linspace(3, 12, 2)
grids = np.meshgrid(x0s, x1s)
x0, x1 = grids[0], grids[1]
x0s, x1s, x0, x1
(array([ 2., 15.]), array([ 3., 12.]), array([[ 2., 15.], [ 2., 15.]]), array([[ 3., 3.], [12., 12.]]))
x1.ravel()
array([ 3., 3., 12., 12.])
x0.ravel()
array([ 2., 15., 2., 15.])
X = np.c_[x0.ravel(), x1.ravel()] X.shape,X
((4, 2), array([[ 2., 3.], [15., 3.], [ 2., 12.], [15., 12.]]))
y_pred=np.array([[1,0],[0,1]])
np.meshgrid(x0s, x1s)
[array([[ 2., 15.], [ 2., 15.]]), array([[ 3., 3.], [12., 12.]])]
X = np.c_[x0.ravel(), x1.ravel()] X.shape,x0.shape
((4, 2), (2, 2))
x0
array([[ 2., 15.], [ 2., 15.]])