基于机器学习模型预测信用卡潜在用户(XGBoost、LightGBM和Random Forest)(一):https://developer.aliyun.com/article/1535285
Second Attempt: XGBoost Classifier
# Class-balancing recipe for XGBoost: the balance is expressed as one weight
# per training row and handed to fit() through sample_weight.
from sklearn.utils import class_weight

# compute_sample_weight returns, for every row of y_train, the 'balanced'
# weight of that row's class. It replaces the manual loop, which read a
# never-assigned `classes_weights` and indexed it with `val-1` (an index that
# is wrong when the labels are 0/1).
weights = class_weight.compute_sample_weight('balanced', y_train)

# NOTE(review): sample_weight needs one entry per row of the y passed to
# fit(); confirm that X, y are the same split as y_train.
xgb_classifier.fit(X, y, sample_weight=weights)
# Trying XGBoost
import xgboost as xg
from xgboost import XGBClassifier
from sklearn.utils import class_weight

# XGBClassifier has no `class_weight` parameter (XGBoost warns that it
# "might not be used"), so class balancing is supplied as per-row sample
# weights instead. np.ravel flattens the labels whether yc_train is a Series
# or a single-column DataFrame.
sample_weights = class_weight.compute_sample_weight('balanced', np.ravel(yc_train))

clf2 = xg.XGBClassifier().fit(xc_train, yc_train, sample_weight=sample_weights)

# Hard class predictions on the held-out split, used by the metric cells below.
xg_pred = clf2.predict(xc_test)
[23:35:16] WARNING: /private/var/folders/fc/8d9mxh2s4ssd8k64mkmlsrj00000gn/T/pip-req-build-y40nwdrb/build/temp.macosx-10.9-x86_64-3.8/xgboost/src/learner.cc:576: Parameters: { "class_weight" } might not be used. This may not be accurate due to some parameters are only used in language bindings but passed down to XGBoost core. Or some parameters are not used but slip through this verification. Please open an issue if you find above cases. [23:35:16] WARNING: /private/var/folders/fc/8d9mxh2s4ssd8k64mkmlsrj00000gn/T/pip-req-build-y40nwdrb/build/temp.macosx-10.9-x86_64-3.8/xgboost/src/learner.cc:1100: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
plt.rcParams['figure.figsize'] = (12, 8)

# XG Boost Results
# Column 1 of predict_proba is taken as the positive-class score; the ROC
# curve points and the AUC are computed from these scores.
xg_pred_2 = clf2.predict_proba(xc_test)[:, 1]
Fpr, Tpr, thresholds = roc_curve(yc_test, xg_pred_2, pos_label=True)
auc = roc_auc_score(yc_test, xg_pred_2)
print(" ROC_AUC score is ", auc)

# The remaining metrics are computed from the hard labels in xg_pred.
for label, metric in (
    ("accuracy score is : ", accuracy_score),
    ("Precision is : ", precision_score),
    ("Recall is: ", recall_score),
    ("F1 Score is : ", f1_score),
    ("classification report \n", classification_report),
):
    print(label, metric(yc_test, xg_pred))

# Confusion matrix drawn as an annotated heatmap.
cnf = confusion_matrix(yc_test, xg_pred)
sns.heatmap(cnf, annot=True, cmap="magma")
ROC_AUC score is 0.8706238059470456 accuracy score is : 0.8033968090581575 Precision is : 0.8246741325500275 Recall is: 0.7706296105678504 F1 Score is : 0.7967364313586378 classification report precision recall f1-score support 0.0 0.78 0.84 0.81 11658 1.0 0.82 0.77 0.80 11658 accuracy 0.80 23316 macro avg 0.80 0.80 0.80 23316 weighted avg 0.80 0.80 0.80 23316 <AxesSubplot:>
plt.rcParams['figure.figsize'] = (12, 6)

# Plot the ROC curve of the XGBoost classifier.
# The dashed green line is the chance diagonal (TPR == FPR), running from
# (0, 0) to (1, 1); a useful classifier's curve lies above it. The original
# drew the anti-diagonal from (0, 1) to (1, 0), which is not a meaningful
# reference for a ROC plot.
plt.plot([0, 1], [0, 1], 'g--')
plt.plot(Fpr, Tpr)
plt.xlabel('False_Positive_Rate')
plt.ylabel('True_Positive_Rate')
plt.title("XG_Boost Classifier")
plt.show()
Third Attempt: LGBM Model with Stratified K-Folds
# Trying stratification modeling
from sklearn.model_selection import KFold, StratifiedKFold


def cross_val(xc, yc, model, params, folds=10):
    """Fit and score one fresh estimator per stratified fold.

    For every fold a new ``model(**params)`` is trained on the training part
    of the split, with the held-out part passed as ``eval_set`` for early
    stopping. The ROC AUC on that held-out part is printed.

    Args:
        xc: Feature table; rows are selected positionally with ``.iloc``.
        yc: Target aligned with ``xc``; rows are selected with ``.iloc``.
        model: Estimator class (not an instance), called as ``model(**params)``.
        params: Keyword arguments forwarded to the estimator constructor.
        folds: Number of stratified splits.

    Returns:
        The estimator fitted on the last fold only; earlier folds' models
        are discarded.
    """
    splitter = StratifiedKFold(n_splits=folds, shuffle=True, random_state=42)
    fold_splits = splitter.split(xc, yc)

    for fold, (train_idx, test_idx) in enumerate(fold_splits):
        print(f"Fold: {fold}")

        train_x, train_y = xc.iloc[train_idx], yc.iloc[train_idx]
        held_x, held_y = xc.iloc[test_idx], yc.iloc[test_idx]

        # NOTE(review): early_stopping_rounds / verbose as fit() keywords are
        # the LightGBM 3.x style; newer LightGBM releases expect callbacks
        # instead — confirm against the installed version.
        fit_options = {
            "eval_set": [(held_x, held_y)],
            "early_stopping_rounds": 100,
            "verbose": 300,
        }
        model_c = model(**params)
        model_c.fit(train_x, train_y, **fit_options)

        # Score the fold on the positive-class probabilities.
        pred_y = model_c.predict_proba(held_x)[:, 1]
        roc_score = roc_auc_score(held_y, pred_y)
        print(f"roc_auc_score: {roc_score}")
        print("-" * 50)

    return model_c
# Applying LGBM Model with 10 stratified cross-folds
from lightgbm import LGBMClassifier

# Hyper-parameters forwarded to the LGBMClassifier constructor by cross_val.
# NOTE(review): in LightGBM, subsample is only applied when subsample_freq is
# greater than 0 — confirm whether row subsampling was intended here.
lgb_params = dict(
    learning_rate=0.045,
    n_estimators=10000,
    max_bin=84,
    num_leaves=10,
    max_depth=20,
    reg_alpha=8.457,
    reg_lambda=6.853,
    subsample=0.749,
)

# cross_val hands back the estimator it fitted on the final fold.
lgb_model = cross_val(xc, yc, LGBMClassifier, lgb_params)
Fold: 0 Training until validation scores don't improve for 100 rounds [300] valid_0's binary_logloss: 0.433821 [600] valid_0's binary_logloss: 0.433498 Early stopping, best iteration is: [599] valid_0's binary_logloss: 0.433487 roc_auc_score: 0.8748638095718249 -------------------------------------------------- Fold: 1 Training until validation scores don't improve for 100 rounds [300] valid_0's binary_logloss: 0.434881 [600] valid_0's binary_logloss: 0.43445 Early stopping, best iteration is: [569] valid_0's binary_logloss: 0.43442 roc_auc_score: 0.8755631159104413 -------------------------------------------------- Fold: 2 Training until validation scores don't improve for 100 rounds [300] valid_0's binary_logloss: 0.431872 [600] valid_0's binary_logloss: 0.43125 [900] valid_0's binary_logloss: 0.430984 Early stopping, best iteration is: [1013] valid_0's binary_logloss: 0.430841 roc_auc_score: 0.877077541404848 -------------------------------------------------- Fold: 3 Training until validation scores don't improve for 100 rounds [300] valid_0's binary_logloss: 0.442048 [600] valid_0's binary_logloss: 0.44142 [900] valid_0's binary_logloss: 0.441142 Early stopping, best iteration is: [895] valid_0's binary_logloss: 0.44114 roc_auc_score: 0.8721270953106521 -------------------------------------------------- Fold: 4 Training until validation scores don't improve for 100 rounds [300] valid_0's binary_logloss: 0.439466 [600] valid_0's binary_logloss: 0.438899 Early stopping, best iteration is: [782] valid_0's binary_logloss: 0.438824 roc_auc_score: 0.8709229804739002 -------------------------------------------------- Fold: 5 Training until validation scores don't improve for 100 rounds [300] valid_0's binary_logloss: 0.427545 Early stopping, best iteration is: [445] valid_0's binary_logloss: 0.42739 roc_auc_score: 0.8792290845510382 -------------------------------------------------- Fold: 6 Training until validation scores don't improve for 100 rounds [300] valid_0's 
binary_logloss: 0.440554 [600] valid_0's binary_logloss: 0.439762 [900] valid_0's binary_logloss: 0.439505 [1200] valid_0's binary_logloss: 0.439264 Early stopping, best iteration is: [1247] valid_0's binary_logloss: 0.439142 roc_auc_score: 0.872610593872283 -------------------------------------------------- Fold: 7 Training until validation scores don't improve for 100 rounds [300] valid_0's binary_logloss: 0.423764 Early stopping, best iteration is: [414] valid_0's binary_logloss: 0.423534 roc_auc_score: 0.8806521642373888 -------------------------------------------------- Fold: 8 Training until validation scores don't improve for 100 rounds [300] valid_0's binary_logloss: 0.440673 Early stopping, best iteration is: [409] valid_0's binary_logloss: 0.440262 roc_auc_score: 0.8708570312002339 -------------------------------------------------- Fold: 9 Training until validation scores don't improve for 100 rounds [300] valid_0's binary_logloss: 0.441536 [600] valid_0's binary_logloss: 0.441034 Early stopping, best iteration is: [661] valid_0's binary_logloss: 0.440952 roc_auc_score: 0.8713195377336685 --------------------------------------------------
# LGBM results
# Refit the LightGBM model on the training split first, so the probability
# scores and the hard labels below both come from the same fitted model.
lgb_model.fit(xc_train, yc_train)

# The scores must come from lgb_model. The original took them from clf2 (the
# XGBoost model), so the "LGBM" ROC curve and AUC were really XGBoost's.
lgb_pred_2 = lgb_model.predict_proba(xc_test)[:, 1]
Fpr, Tpr, thresholds = roc_curve(yc_test, lgb_pred_2, pos_label=True)
auc = roc_auc_score(yc_test, lgb_pred_2)
print(" ROC_AUC score is ", auc)

# Threshold-based metrics use the hard class predictions.
lgb_pred = lgb_model.predict(xc_test)
print("accuracy score is : ", accuracy_score(yc_test, lgb_pred))
print("Precision is : ", precision_score(yc_test, lgb_pred))
print("Recall is: ", recall_score(yc_test, lgb_pred))
print("F1 Score is : ", f1_score(yc_test, lgb_pred))
print("classification report \n", classification_report(yc_test, lgb_pred))

# Confusion matrix drawn as an annotated heatmap.
cnf = confusion_matrix(yc_test, lgb_pred)
sns.heatmap(cnf, annot=True, cmap="magma")
ROC_AUC score is 0.8706238059470456 accuracy score is : 0.8030965860353405 Precision is : 0.8258784469242829 Recall is: 0.7681420483787956 F1 Score is : 0.7959646237944981 classification report precision recall f1-score support 0.0 0.78 0.84 0.81 11658 1.0 0.83 0.77 0.80 11658 accuracy 0.80 23316 macro avg 0.80 0.80 0.80 23316 weighted avg 0.80 0.80 0.80 23316 <AxesSubplot:>
plt.rcParams['figure.figsize'] = (12, 6)

# Plot the ROC curve of the LightGBM classifier.
# The dashed green line is the chance diagonal (TPR == FPR), running from
# (0, 0) to (1, 1); a useful classifier's curve lies above it. The original
# drew the anti-diagonal from (0, 1) to (1, 0), which is not a meaningful
# reference for a ROC plot.
plt.plot([0, 1], [0, 1], 'g--')
plt.plot(Fpr, Tpr)
plt.xlabel('False_Positive_Rate')
plt.ylabel('True_Positive_Rate')
plt.title("LGB Classifier model")
plt.show()
5. 模型预测
模型训练完成后,我们使用测试数据进行预测:
# The "source" column is dropped as irrelevant for prediction.
# drop() without inplace returns a new frame, so df_test itself is left
# untouched. The original aliased df_test (df_3 = df_test) and dropped in
# place, which silently mutated df_test and made the cell fail with a
# KeyError when run a second time.
df_3 = df_test.drop(columns=["source"])
df_3.head()
# Features for prediction: everything except the target column and the row ID.
xc_pred = df_3.drop(columns=['Is_Lead', "ID"])

# Standardize the features (zero mean, unit variance per column) and keep the
# original column names on the result.
# NOTE(review): this fits a new StandardScaler on the prediction data itself;
# if the models were trained on features scaled by a scaler fitted on the
# training data, that scaler's transform() should probably be reused — confirm.
sc = StandardScaler()
scaled_values = sc.fit_transform(xc_pred)
df_xc_pred = pd.DataFrame(scaled_values, columns=xc_pred.columns)
# Positive-class probability from each of the three fitted models
# (XGBoost, LightGBM, Random Forest), evaluated in that order.
lead_pred_xg, lead_pred_lgb, lead_pred_rf = (
    fitted.predict_proba(df_xc_pred)[:, 1]
    for fitted in (clf2, lgb_model, rf_clf)
)
print(lead_pred_xg, lead_pred_lgb, lead_pred_rf)
[0.09673516 0.9428428 0.12728807 ... 0.31698707 0.1821623 0.17593904] [0.14278614 0.94357392 0.13603912 ... 0.22251432 0.24186564 0.16873483] [0.17 0.97 0.09 ... 0.5 0.09 0.15]
# Dataframe for lead prediction
# Wrap each probability array in a single-column "Is_Lead" DataFrame so it
# can be joined with the ID column afterwards.
lead_pred_lgb, lead_pred_xg, lead_pred_rf = (
    pd.DataFrame(scores, columns=["Is_Lead"])
    for scores in (lead_pred_lgb, lead_pred_xg, lead_pred_rf)
)
# Renumber df_test from 0 so its rows line up with the prediction frames when
# they are concatenated column-wise below. reset_index() without drop=True
# also keeps the previous index as an extra column.
df_test = df_test.reset_index()
df_test.head()


def _id_with_prediction(pred_frame):
    """Return a two-column frame pairing df_test's ID with pred_frame."""
    paired = pd.concat([df_test["ID"], pred_frame], axis=1, ignore_index=True)
    paired.columns = ["ID", "Is_Lead"]
    return paired


# Saving ID and prediction to csv file for XG Model
df_pred_xg = _id_with_prediction(lead_pred_xg)
print(df_pred_xg.head())
df_pred_xg.to_csv("Credit_Card_Lead_Predictions_final_xg.csv", index=False)

# Saving ID and prediction to csv file for LGB Model
df_pred_lgb = _id_with_prediction(lead_pred_lgb)
print(df_pred_lgb.head())
df_pred_lgb.to_csv("Credit_Card_Lead_Predictions_final_lgb.csv", index=False)

# Saving ID and prediction to csv file for RF model
df_pred_rf = _id_with_prediction(lead_pred_rf)
print(df_pred_rf.head())
df_pred_rf.to_csv("Credit_Card_Lead_Predictions_final_rf.csv", index=False)
ID Is_Lead 0 VBENBARO 0.096735 1 CCMEWNKY 0.942843 2 VK3KGA9M 0.127288 3 TT8RPZVC 0.052260 4 SHQZEYTZ 0.057762 ID Is_Lead 0 VBENBARO 0.142786 1 CCMEWNKY 0.943574 2 VK3KGA9M 0.136039 3 TT8RPZVC 0.084144 4 SHQZEYTZ 0.055887 ID Is_Lead 0 VBENBARO 0.17 1 CCMEWNKY 0.97 2 VK3KGA9M 0.09 3 TT8RPZVC 0.12 4 SHQZEYTZ 0.09
6. 模型保存
为了在未来能够方便地加载和使用训练好的模型,我们将模型保存为pickle文件:
import joblib

# Persist the trained LightGBM model to disk so it can be reloaded later.
model_path = 'lgb_model.pkl'
joblib.dump(lgb_model, model_path)
['lgb_model.pkl']