Chp8-2
2019 年 12 月 23 日
# In [3]: polynomial-degree model selection on the first 50 iris rows.
# Fits polynomials of degree 1..7 on a 50/50 train/test split and plots
# training vs. testing RMSE against the polynomial degree.
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

# Raw string avoids any backslash-escape surprises in the Windows path.
df = pd.read_csv(r'C:\Python\Scripts\my_data\iris.csv', header=None,
                 names=['sepal_length', 'sepal_width', 'petal_length',
                        'petal_width', 'target'])
# NOTE(review): the original selected ' sepal_width' (leading space), which
# does not match the 'sepal_width' column declared above — fixed here.
my_data = df[['sepal_length', 'sepal_width']].iloc[:50]


def rmse(x, y, coefs):
    """Root-mean-square error of the polynomial `coefs` evaluated at x vs. y."""
    yfit = np.polyval(coefs, x)
    return np.sqrt(np.mean((y - yfit) ** 2))


xtrain, xtest, ytrain, ytest = train_test_split(
    my_data['sepal_length'], my_data['sepal_width'], test_size=0.5)

train_err = []
validation_err = []
degrees = range(1, 8)
for d in degrees:  # the original's enumerate() index was unused
    p = np.polyfit(xtrain, ytrain, d)
    train_err.append(rmse(xtrain, ytrain, p))
    validation_err.append(rmse(xtest, ytest, p))

fig, ax = plt.subplots()  # original read '1fig,ax=...' — stray '1' removed
ax.plot(degrees, validation_err, lw=2, label='testing error')
ax.plot(degrees, train_err, lw=2, label='training error')
ax.legend(loc=0)
ax.set_xlabel('degree of polynomial')
ax.set_ylabel('RMSE')
# Out[3]: Text(0,0.5,'RMSE')
# In [54]: visualize a 3-fold KFold partition of the sepal columns —
# one subplot per fold, training points red, validation points blue.
# Assumes `df` (the iris DataFrame) exists from the earlier cell.
from sklearn.model_selection import KFold

my_data = df[['sepal_length', 'sepal_width']]
nfolds = 3
fig, axes = plt.subplots(1, nfolds, figsize=(14, 4))
kf = KFold(n_splits=nfolds)
for i, (training, validation) in enumerate(kf.split(my_data)):
    # Training points (red).  Select consistently from my_data; the original
    # mixed my_data and df, which hold the same columns.
    x = my_data.iloc[training]['sepal_length']
    y = my_data.iloc[training]['sepal_width']
    axes[i].plot(x, y, 'ro')
    # Validation points (blue).  The original used ' sepal_width' (leading
    # space), which would raise KeyError — fixed to 'sepal_width'.
    x = my_data.iloc[validation]['sepal_length']
    y = my_data.iloc[validation]['sepal_width']
    axes[i].plot(x, y, 'bo')
plt.tight_layout()
# In [61]: build class labels for the 150 iris rows — label 1, 2, 3 for each
# consecutive block of 50 rows (one block per species).
my_class = []
for n in range(150):
    if n < 50:
        my_class.append(1)
    elif n < 100:
        my_class.append(2)
    else:
        my_class.append(3)
print(my_class)
# Output: [1]*50 + [2]*50 + [3]*50

# In [65]: compare an optimistic training-set accuracy with a 5-fold
# cross-validated accuracy for a 1-nearest-neighbor classifier.
# Assumes `my_data` (full sepal columns) exists from the earlier cell.
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.neighbors import KNeighborsClassifier

knn1 = KNeighborsClassifier(n_neighbors=1)
knn2 = KNeighborsClassifier(n_neighbors=1)
knn1.fit(my_data[['sepal_length', 'sepal_width']], my_class)  # train on all data
# Scoring on the same data the model was fit on inflates the estimate.
# The original indexed ' sepal_width' (leading space) here — a KeyError; fixed.
print('训练集测试集相同时,模型的性能得分是: ',
      knn1.score(my_data[['sepal_length', 'sepal_width']], my_class))
# The original fused `print('\n')scores=...` into one statement (SyntaxError);
# split into separate statements here.
print('\n')
scores = cross_val_score(knn2, my_data[['sepal_length', 'sepal_width']],
                         my_class, cv=5, scoring='accuracy')  # cross-validation
print('5 折交叉验证时,模型的性能平均得分是: ', scores.mean())
# Output:
# 训练集测试集相同时,模型的性能得分是: 0.9266666666666666
# 5 折交叉验证时,模型的性能平均得分是: 0.7266666666666667