三、卡方检验
取三个特征的结果:
# Chi-square statistic and p-value of every feature in X_fsvar against y.
chivalue, pvalues_chi = chi2(X_fsvar, y)
chivalue

# Mean 3-fold cross-validation score of a 10-tree random forest.
# NOTE(review): X_fschi is not built in this cell; presumably it is the
# output of an earlier SelectKBest(chi2, ...) step -- confirm.
rfc_chi = RFC(n_estimators=10, random_state=0)
cross_val_score(rfc_chi, X_fschi, y, cv=3).mean()
# Chi-square statistic and p-value of every feature in X_fsvar against y.
chivalue, pvalues_chi = chi2(X_fsvar, y)
chivalue

# Mean 3-fold cross-validation score of a 10-tree random forest.
# NOTE(review): X_fschi is defined outside this cell; verify which feature
# subset it holds before comparing this score with other cells.
rfc_chi = RFC(n_estimators=10, random_state=0)
cross_val_score(rfc_chi, X_fschi, y, cv=3).mean()
取全部特征的结果:
四、F检验
# F-test (ANOVA F-value) on the X_fsvar feature matrix.
from sklearn.feature_selection import f_classif

F, pvalues_f = f_classif(X_fsvar, y)
F

# Keep every feature whose p-value is not above 0.05:
# k = total number of features - number of features with p > 0.05.
n_rejected = (pvalues_f > 0.05).sum()
k = F.shape[0] - n_rejected

X_fsF = SelectKBest(f_classif, k=k).fit_transform(X_fsvar, y)

# Mean 3-fold cross-validation score of a 10-tree random forest on the
# F-test-selected features.
rfc_f = RFC(n_estimators=10, random_state=0)
cross_val_score(rfc_f, X_fsF, y, cv=3).mean()
# F-test (ANOVA F-value) on the X feature matrix.
from sklearn.feature_selection import f_classif

F, pvalues_f = f_classif(X, y)
F

# Keep every feature whose p-value is not above 0.05:
# k = total number of features - number of features with p > 0.05.
n_rejected = (pvalues_f > 0.05).sum()
k = F.shape[0] - n_rejected

X_fsF = SelectKBest(f_classif, k=k).fit_transform(X, y)

# Mean 9-fold cross-validation score of a 10-tree random forest.
# NOTE(review): cv=9 here differs from the cv=3 / cv=5 used by the other
# cells in this file -- confirm that this is intentional.
rfc_f = RFC(n_estimators=10, random_state=0)
cross_val_score(rfc_f, X_fsF, y, cv=9).mean()
五、互信息法
# Mutual-information feature selection.
from sklearn.feature_selection import mutual_info_classif as MIC

# Estimated mutual information between each feature of X_fsvar and y.
result = MIC(X_fsvar, y)

# Keep only the features whose estimated mutual information is positive.
# NOTE(review): MIC is called without random_state, and SelectKBest calls
# it again internally, so the two sets of scores may differ slightly --
# confirm whether a fixed random_state is wanted.
k = result.shape[0] - (result <= 0).sum()

X_fsmic = SelectKBest(MIC, k=k).fit_transform(X_fsvar, y)

# Mean 5-fold cross-validation score of a 10-tree random forest.
rfc_mic = RFC(n_estimators=10, random_state=0)
cross_val_score(rfc_mic, X_fsmic, y, cv=5).mean()
六、嵌入法
# Embedded method: select features through a random forest's importances.
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier as RFC

RFC_ = RFC(n_estimators=10, random_state=0)

# Features whose importance falls below the 0.005 threshold are discarded.
embedded_selector = SelectFromModel(RFC_, threshold=0.005)
X_embedded = embedded_selector.fit_transform(X, y)
X_embedded.shape
# Result noted by the author: two features can be removed, leaving 7.
# Pick the SelectFromModel threshold from a learning curve.
import numpy as np
import matplotlib.pyplot as plt

# Fit the forest once and reuse its importances. The original fitted it
# twice and discarded the first result.
importances = RFC_.fit(X, y).feature_importances_

# 20 evenly spaced candidate thresholds from 0 up to the largest importance.
threshold = np.linspace(0, importances.max(), 20)

# Mean 5-fold cross-validation score obtained at each candidate threshold.
score = []
for i in threshold:
    X_embedded = SelectFromModel(RFC_, threshold=i).fit_transform(X, y)
    once = cross_val_score(RFC_, X_embedded, y, cv=5).mean()
    score.append(once)

plt.plot(threshold, score)
plt.show()
# Select features at threshold 0.005 and report the resulting shape.
embedded_selector = SelectFromModel(RFC_, threshold=0.005)
X_embedded = embedded_selector.fit_transform(X, y)
X_embedded.shape

# Mean 5-fold cross-validation score of RFC_ on the selected features.
cross_val_score(RFC_, X_embedded, y, cv=5).mean()
将正确率高的学习曲线进行放大
# Zoomed learning curve: 20 evenly spaced thresholds between 0 and 0.112.
zoom_thresholds = np.linspace(0, 0.112, 20)

# Mean 5-fold cross-validation score obtained at each threshold.
score2 = []
for i in zoom_thresholds:
    X_embedded = SelectFromModel(RFC_, threshold=i).fit_transform(X, y)
    once = cross_val_score(RFC_, X_embedded, y, cv=5).mean()
    score2.append(once)

# Wide figure with one x tick per evaluated threshold.
plt.figure(figsize=[20, 5])
plt.plot(zoom_thresholds, score2)
plt.xticks(zoom_thresholds)
plt.show()
七、包装法
# Wrapper method: recursive feature elimination (RFE) around a random forest.
from sklearn.feature_selection import RFE

RFC_ = RFC(n_estimators=100, random_state=0)

# NOTE(review): if X has only about 9 features (TODO confirm), then
# n_features_to_select=9 keeps all of them and step=50 exceeds the feature
# count -- verify that these values are intended for this dataset.
selector = RFE(RFC_, n_features_to_select=9, step=50)
selector.fit(X, y)

selector.support_.sum()  # number of features marked as selected
selector.ranking_        # per-feature ranking computed by RFE

X_wrapper = selector.transform(X)

# Mean 5-fold cross-validation score of RFC_ on the RFE-selected features.
cross_val_score(RFC_, X_wrapper, y, cv=5).mean()
# Learning curve over the number of features kept by RFE (1 through 8).
feature_counts = range(1, 9)

# Mean 5-fold cross-validation score obtained for each feature count.
score = []
for i in feature_counts:
    X_wrapper = RFE(RFC_, n_features_to_select=i, step=50).fit_transform(X, y)
    once = cross_val_score(RFC_, X_wrapper, y, cv=5).mean()
    score.append(once)

# Wide figure with one x tick per evaluated feature count.
plt.figure(figsize=[20, 5])
plt.plot(feature_counts, score)
plt.xticks(feature_counts)
plt.show()