Code Practice
Parameter Details
Tuning the following parameters can improve accuracy (a combined example follows this list):
learning_rate: the learning rate.
Default: 0.1
Tuning strategy: start relatively large, e.g. 0.1; lower it as the final step, after the other parameters have been tuned.
Range: 0.01-0.3
max_depth: maximum depth of each tree.
Default: -1 (unlimited)
Tuning strategy: none in particular
Range: 3-8 (no more than 10)
num_leaves: number of leaves per tree; it controls tree-model complexity (a tree of depth d has at most 2^d leaves, so keep num_leaves below 2^max_depth).
Lowering it reduces overfitting.
max_bin: the maximum number of histogram bins that feature values are bucketed into (a histogram size, not a count of leaf plus internal nodes).
The bin count caps how finely each feature is discretized; similar feature values end up in the same bin.
A small number of bins may lower training accuracy but can improve generalization power, i.e. it helps against overfitting.
LightGBM automatically compresses memory based on max_bin; for example, if max_bin=255, LightGBM stores feature values as uint8_t.
min_data_in_leaf: the minimum number of samples in one leaf; can be used to deal with overfitting.
Default: 20
Tuning strategy: search over a range, and avoid setting it too large.
feature_fraction: fraction of features randomly selected on each iteration.
Default: 1.0
Tuning strategy: tune between 0.5 and 0.9.
Can be used to speed up training.
Can be used to deal with overfitting.
bagging_fraction: randomly selects this fraction of the data, without resampling.
Default: 1.0
Tuning strategy: tune between 0.5 and 0.9.
Can be used to speed up training.
Can be used to deal with overfitting.
bagging_freq: bagging frequency. 0 disables bagging; a non-zero value k performs bagging every k iterations (it must be non-zero for bagging_fraction to take effect).
Default: 0
Tuning strategy: 3-5
Others
lambda_l1: L1 regularization.
lambda_l2: L2 regularization.
min_split_gain: the minimum gain required to perform a split.
Default: 0.0
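To see how these knobs fit together, here is a minimal sketch of a baseline LGBMClassifier using values inside the ranges suggested above; the specific numbers are illustrative placeholders, not tuned results:

import lightgbm as lgb

# Illustrative baseline; every value below sits inside the ranges discussed above
model = lgb.LGBMClassifier(
    learning_rate=0.1,      # start large, shrink at the very end of tuning
    max_depth=6,            # 3-8, rarely above 10
    num_leaves=31,          # keep well below 2**max_depth
    max_bin=63,             # fewer bins -> coarser splits, better generalization
    min_data_in_leaf=20,    # larger values fight overfitting
    feature_fraction=0.8,   # sample 80% of the features on each iteration
    bagging_fraction=0.8,   # sample 80% of the rows, without resampling
    bagging_freq=5,         # re-draw the bagging sample every 5 iterations
    lambda_l1=0.0,          # L1 regularization
    lambda_l2=0.0,          # L2 regularization
    min_split_gain=0.0,     # minimum gain required to split
)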
Hands-on Code
# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt                      # plotting
import seaborn as sns                                # plotting
import lightgbm as lgb
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score  # cross-validation
from sklearn.model_selection import GridSearchCV     # grid search
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler  # standardization / normalization
from sklearn.metrics import precision_score
from sklearn.metrics import classification_report    # evaluation report

# Silence warnings
import warnings
warnings.filterwarnings("ignore")
Best model and parameters (dataset of 1,000 samples)
Some readers may wonder why our LightGBM does not perform as well as XGBoost here. The reason lies in the data: this is a small dataset, and reaching this level of performance at all is entirely the result of repeated, iterative parameter tuning.
df = pd.read_csv(r"数据.csv")
X = df.iloc[:, :-1]
y = df.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=1)

model = lgb.LGBMClassifier(
    n_estimators=39, max_depth=8, num_leaves=12, max_bin=7,
    min_data_in_leaf=10, bagging_fraction=0.5, feature_fraction=0.59,
    boosting_type="gbdt", application="binary", min_split_gain=0.15,
    n_jobs=-1, bagging_freq=30, lambda_l1=1e-05, lambda_l2=1e-05,
    learning_rate=0.1, random_state=90)
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

'''
Evaluation metrics
'''
# Count predictions that match the ground truth
true = np.sum(y_pred == y_test)
print('Number of correct predictions:', true)
print('Number of wrong predictions:', y_test.shape[0] - true)

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, cohen_kappa_score
print('Accuracy on the test set: {:.4}%'.format(accuracy_score(y_test, y_pred) * 100))
print('Precision on the test set: {:.4}%'.format(precision_score(y_test, y_pred) * 100))
print('Recall on the test set: {:.4}%'.format(recall_score(y_test, y_pred) * 100))
print('F1 score on the test set:', f1_score(y_test, y_pred))
print("Cohen's Kappa on the test set:", cohen_kappa_score(y_test, y_pred))
# Print the classification report
print('Classification report:', '\n', classification_report(y_test, y_pred))

# ROC curve and AUC
from sklearn import metrics
# Predicted probability of the positive class: predict_proba returns two columns,
# the first for class 0 and the second for class 1
# https://blog.csdn.net/dream6104/article/details/89218239
y_pred_prob = model.predict_proba(X_test)[:, 1]
# pos_label is the positive-class label; use 1 for 0/1 targets, 2 for 1/2 targets
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_prob, pos_label=1)
roc_auc = metrics.auc(fpr, tpr)  # AUC is the area under the ROC curve

plt.figure(figsize=(8, 6))
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.plot(fpr, tpr, 'r', label='AUC = %0.2f' % roc_auc)
plt.legend(loc='lower right')
plt.xlim([0, 1.1])
plt.ylim([0, 1.1])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.show()
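As a side note, scikit-learn can compute the same AUC in a single call, which makes a handy cross-check on the roc_curve/auc pipeline above (assuming 0/1 labels):

from sklearn.metrics import roc_auc_score

# Equivalent one-liner for the AUC computed above
print('AUC:', roc_auc_score(y_test, y_pred_prob))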
Model Tuning
Tuning with learning curves
Initialize the parameters first. You can also use a grid search on the training set to locate the rough region of each parameter, then iterate toward the best value with a learning curve.
model = lgb.LGBMClassifier(
    boosting_type='gbdt', objective='binary', metrics='auc',
    learning_rate=0.01, n_estimators=39, max_depth=4,
    num_leaves=12, max_bin=15, min_data_in_leaf=11,
    bagging_fraction=0.8, bagging_freq=20, feature_fraction=0.7,
    lambda_l1=1e-05, lambda_l2=1e-05, min_split_gain=0.5)
For example:
params_test5 = {'min_split_gain': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]}
gsearch5 = GridSearchCV(
    estimator=lgb.LGBMClassifier(
        boosting_type='gbdt', objective='binary', metrics='auc',
        learning_rate=0.01, n_estimators=1000, max_depth=4,
        num_leaves=12, max_bin=15, min_data_in_leaf=11,
        bagging_fraction=0.8, bagging_freq=20, feature_fraction=0.7,
        lambda_l1=1e-05, lambda_l2=1e-05, min_split_gain=0.5),
    param_grid=params_test5, scoring='roc_auc', cv=5)
gsearch5.fit(X_train, y_train)
gsearch5.best_params_, gsearch5.best_score_
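After fitting, it can also help to look beyond best_params_ and inspect the full cross-validation table, to see how flat the score is around the optimum. A small sketch using GridSearchCV's standard cv_results_ attribute:

# Mean and std of the cross-validated AUC per candidate value of min_split_gain
cv_df = pd.DataFrame(gsearch5.cv_results_)
print(cv_df[['param_min_split_gain', 'mean_test_score', 'std_test_score']])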
Learning curve
# Coarse search over n_estimators
scorel = []
for i in range(0, 200, 10):
    model = lgb.LGBMClassifier(n_estimators=i + 1,
                               random_state=2022).fit(X_train, y_train)
    score = model.score(X_test, y_test)
    scorel.append(score)
print(max(scorel), (scorel.index(max(scorel)) * 10) + 1)

# Plot how accuracy changes with the number of estimators
plt.figure(figsize=[20, 5])
plt.plot(range(1, 200, 10), scorel)
plt.show()

# Refine the learning curve around the optimum found above
# (the final model uses n_estimators=39)
scorel = []
for i in range(35, 45):
    model = lgb.LGBMClassifier(n_estimators=i, n_jobs=-1,
                               random_state=90).fit(X_train, y_train)
    score = model.score(X_test, y_test)
    scorel.append(score)
print(max(scorel), [*range(35, 45)][scorel.index(max(scorel))])

plt.figure(figsize=[20, 5])
plt.plot(range(35, 45), scorel)
plt.show()
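An alternative to sweeping n_estimators by hand, not used in this post but worth knowing, is LightGBM's built-in early stopping, which picks the tree count on a validation set automatically. The callback API below assumes LightGBM >= 3.3; older versions pass early_stopping_rounds to fit instead:

# Let early stopping choose the number of trees on a held-out set
model = lgb.LGBMClassifier(n_estimators=1000, random_state=2022)
model.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    eval_metric='auc',
    callbacks=[lgb.early_stopping(stopping_rounds=50)],
)
print('best iteration:', model.best_iteration_)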
max_depth
# Sweep max_depth with n_estimators fixed at 39
scorel = []
for i in range(3, 20):
    model = lgb.LGBMClassifier(n_estimators=39, max_depth=i, n_jobs=-1,
                               random_state=90).fit(X_train, y_train)
    score = model.score(X_test, y_test)
    scorel.append(score)
print(max(scorel), [*range(3, 20)][scorel.index(max(scorel))])

plt.figure(figsize=[20, 5])
plt.plot(range(3, 20), scorel)
plt.show()
Tuning integer-range parameters (edit the swept parameter manually)
# Template for sweeping an integer parameter: substitute i for whichever
# parameter you are currently tuning; max_bin=i is used here as the example
# (num_leaves=12, max_bin=15, min_data_in_leaf=11, bagging_fraction=0.8,
#  bagging_freq=20, feature_fraction=0.7 were earlier candidate values)
scorel = []
for i in np.arange(7, 45, 1):
    model = lgb.LGBMClassifier(n_estimators=39, max_depth=8, num_leaves=12,
                               max_bin=i, min_data_in_leaf=10,
                               bagging_fraction=0.5, feature_fraction=0.6,
                               n_jobs=-1, bagging_freq=30,
                               random_state=90).fit(X_train, y_train)
    score = model.score(X_test, y_test)
    scorel.append(score)
print(max(scorel), [*np.arange(7, 45, 1)][scorel.index(max(scorel))])

plt.figure(figsize=[20, 5])
plt.plot(np.arange(7, 45, 1), scorel)
plt.show()
Floating-point parameters (edit the swept parameter manually)
# Sweep a float parameter, here min_split_gain
scorel = []
for i in np.arange(0.01, 1, 0.01):
    model = lgb.LGBMClassifier(n_estimators=39, max_depth=8, num_leaves=12,
                               max_bin=7, min_data_in_leaf=10,
                               bagging_fraction=0.5, feature_fraction=0.59,
                               min_split_gain=i, n_jobs=-1, bagging_freq=30,
                               random_state=90).fit(X_train, y_train)
    score = model.score(X_test, y_test)
    scorel.append(score)
print(max(scorel), [*np.arange(0.01, 1, 0.01)][scorel.index(max(scorel))])

plt.figure(figsize=[20, 5])
plt.plot(np.arange(0.01, 1, 0.01), scorel)
plt.show()
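The sweeps above all repeat the same loop-score-plot pattern, so it can be factored into a helper. sweep_param below is a hypothetical convenience function, not part of the original code:

def sweep_param(param_name, values, fixed_params, X_train, y_train, X_test, y_test):
    """Fit one model per candidate value and plot test accuracy (hypothetical helper)."""
    scores = []
    for v in values:
        params = {**fixed_params, param_name: v}
        model = lgb.LGBMClassifier(**params).fit(X_train, y_train)
        scores.append(model.score(X_test, y_test))
    best = values[int(np.argmax(scores))]
    print(f'best {param_name}: {best} (accuracy {max(scores):.5f})')
    plt.figure(figsize=[20, 5])
    plt.plot(values, scores)
    plt.xlabel(param_name)
    plt.ylabel('accuracy')
    plt.show()
    return best

# Example: sweep max_bin while holding everything else fixed
fixed = dict(n_estimators=39, max_depth=8, num_leaves=12, n_jobs=-1, random_state=90)
sweep_param('max_bin', list(range(7, 45)), fixed, X_train, y_train, X_test, y_test)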