声明:本文示例来自于GitHub用户vkasojhaa的项目,一切权利归其所有,此处仅是自己学习分享。
实现了基于机器学习的乳腺癌的恶性和良性预测,比较了不同机器学习算法之间的性能。主要目的是评估在每种算法的准确性和效率方面对数据进行分类的正确性。
基于机器学习的乳腺癌预测
代码示例
#!/usr/bin/python3
# Import dependencies.
# NOTE(review): the original imported KFold from sklearn.cross_validation,
# a module removed in scikit-learn 0.20; that import also shadowed the
# model_selection.KFold imported above it. GridSearchCV was imported twice.
# Both issues are fixed here.
import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns

# %matplotlib inline  # IPython magic -- only valid inside a Jupyter notebook

# Import models and utilities from the scikit-learn module:
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import (GridSearchCV, KFold, cross_val_score,
                                     train_test_split)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
# Load the dataset from disk and preview the first rows.
data = pd.read_csv("data.csv")
data.head()
# Preprocessing: remove the record id and the empty trailing column,
# neither of which is a predictive feature.
data.drop(columns=['id', 'Unnamed: 32'], inplace=True)
print("Row, Col", data.shape)  # (row, col)
# Encode the diagnosis label: M (malignant) -> 1, B (benign) -> 0.
data['diagnosis'] = data['diagnosis'].map({'B': 0, 'M': 1})
data.head()
# Summary statistics for every numeric column.
data.describe()
探索数据
# Per-feature density plots laid out on a 5x7 grid.
data.plot(kind='density', subplots=True, layout=(5, 7),
          sharex=False, legend=False, fontsize=1)
plt.show()
# Class balance: how many malignant (1) vs benign (0) samples.
print(data.groupby('diagnosis').size())
sns.countplot(data['diagnosis'], label="Count")
plt.show()
# Split the data 70/30 into train and test sets, and visualize the ratio.
traindf, testdf = train_test_split(data, test_size=0.3)
labels = 'Train', 'Test'
plt.pie([70, 30], labels=labels, autopct='%1.1f%%', shadow=True)
plt.show()
print("Train set", traindf.shape)
print("Test set", testdf.shape)
# Correlation heatmap of the ten "mean" features (columns 1..10).
features_mean = list(data.columns[1:11])
corr = data[features_mean].corr()
plt.figure(figsize=(14, 14))
sns.heatmap(corr, cbar=True, square=True, annot=True, fmt='.2f',
            annot_kws={'size': 15}, xticklabels=features_mean,
            yticklabels=features_mean, cmap='coolwarm')
plt.show()
# Generic helper to fit a classifier and report its performance.
Y = data['diagnosis'].values
X = data.drop('diagnosis', axis=1).values
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.30, random_state=21)


def classification_model(model, data, predictors, outcome):
    """Fit *model* on *data*, print training accuracy and 5-fold CV accuracy.

    Parameters:
        model:      any scikit-learn estimator with fit/predict/score
        data:       DataFrame holding both predictors and the outcome
        predictors: list of column names used as features
        outcome:    name of the label column

    The model is re-fitted on the full data at the end so that the fitted
    estimator can be inspected by the caller.
    """
    # Fit the model on the whole frame and report training accuracy.
    model.fit(data[predictors], data[outcome])
    predictions = model.predict(data[predictors])
    # y_true first, y_pred second, per the sklearn convention.
    accuracy = metrics.accuracy_score(data[outcome], predictions)
    print("Accuracy : %s" % "{0:.3%}".format(accuracy))

    # Perform k-fold cross-validation with 5 folds.
    # NOTE(review): the original used the pre-0.18 API
    # KFold(data.shape[0], n_folds=5) and iterated the object directly;
    # the modern API is KFold(n_splits=5) with .split(data). The original
    # also computed cross_val_score into cv_results without ever using it
    # (and passing the legacy kfold object there would fail) -- removed.
    kfold = KFold(n_splits=5)
    error = []
    for train_idx, test_idx in kfold.split(data):
        # Select the training slice for this fold.
        train_predictors = data[predictors].iloc[train_idx, :]
        train_target = data[outcome].iloc[train_idx]
        # Train on the fold and score on the held-out slice.
        model.fit(train_predictors, train_target)
        error.append(model.score(data[predictors].iloc[test_idx, :],
                                 data[outcome].iloc[test_idx]))
    print("Cross-Validation Score : %s" % "{0:.3%}".format(np.mean(error)))

    # Fit the model again so that it can be referred to outside the function.
    model.fit(data[predictors], data[outcome])
逻辑回归模型
# Logistic regression on five mean-value features.
predictor_var = ['texture_mean', 'perimeter_mean', 'smoothness_mean',
                 'compactness_mean', 'symmetry_mean']
outcome_var = 'diagnosis'
model = LogisticRegression()
classification_model(model, traindf, predictor_var, outcome_var)
Accuracy : 91.206% Cross-Validation Score : 90.206%
决策树模型
# Decision tree on the same five mean-value features.
predictor_var = ['texture_mean', 'perimeter_mean', 'smoothness_mean',
                 'compactness_mean', 'symmetry_mean']
model = DecisionTreeClassifier()
classification_model(model, traindf, predictor_var, outcome_var)
Accuracy : 100.000% Cross-Validation Score : 87.446%
# Decision tree restricted to a single feature.
predictor_var = ['texture_mean']
model = DecisionTreeClassifier()
classification_model(model, traindf, predictor_var, outcome_var)
Accuracy : 94.472% Cross-Validation Score : 87.937%
Accuracy : 96.231% Cross-Validation Score : 66.329%
k-近邻模型
# k-nearest-neighbors classifier on the five mean-value features.
predictor_var = ['texture_mean', 'perimeter_mean', 'smoothness_mean',
                 'compactness_mean', 'symmetry_mean']
model = KNeighborsClassifier()
classification_model(model, traindf, predictor_var, outcome_var)
Accuracy : 92.462% Cross-Validation Score : 89.456%
支持向量机模型
# Support vector machine on the five mean-value features.
predictor_var = ['texture_mean', 'perimeter_mean', 'smoothness_mean',
                 'compactness_mean', 'symmetry_mean']
model = SVC()
classification_model(model, traindf, predictor_var, outcome_var)
# Performance comparison of several machine-learning models.
Y = data['diagnosis'].values
X = data.drop('diagnosis', axis=1).values
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.30, random_state=21)

models_list = [
    ('LR', LogisticRegression()),
    ('DT', DecisionTreeClassifier()),
    ('SVM', SVC()),
    ('KNN', KNeighborsClassifier()),
]

num_folds = 10
results = []  # per-model CV score arrays, kept for later plotting
names = []
for name, model in models_list:
    start = time.time()
    cv_results = cross_val_score(model, X_train, Y_train,
                                 cv=num_folds, scoring='accuracy')
    end = time.time()
    results.append(cv_results)
    names.append(name)
    # NOTE(review): the original computed cv_results but only printed the
    # runtime; the point of the comparison is the accuracy, so report
    # mean and std of the CV scores alongside the runtime.
    print("%s: %f (%f) (run time: %f)" % (name, cv_results.mean(),
                                          cv_results.std(), end - start))
LR:(run time: 0.069959) DT:(run time: 0.047665) SVM:(run time: 0.156240) KNN:(run time: 0.029838)