3. Data Preprocessing
3.1 Scaling the Features (standardizing the units)
# Split out the features and the target
feature = data.iloc[:, 1:14]  # the feature columns
target = data.iloc[:, 0]      # the target column
# Instantiate a scaler
scaler = RobustScaler()
# Standardize the data
feature = scaler.fit_transform(feature)
feature = pd.DataFrame(feature, columns=['CAr__', 'PET_a', 'PRCP_a', 'TMP_a', 'WET_a',
                                         'G_ET_', 'G_PRCP_', 'G_Qs_', 'G_Qsb_', 'G_TWSC_',
                                         'dem90mca', 'Lat', 'Long'])
Take a look at (inspect) part of the data after scaling:
feature.head()
Output:
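One caveat worth flagging: above, the scaler is fit on the full dataset before the split, so test-set statistics leak into training. A stricter variant (a sketch reusing the names above) splits first and fits the scaler on the training portion only:

# Split first, then fit the scaler on the training portion only
feature_train, feature_test, target_train, target_test = train_test_split(feature, target)
scaler = RobustScaler()
feature_train = scaler.fit_transform(feature_train)  # fit + transform on the training set
feature_test = scaler.transform(feature_test)        # transform only on the test set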
3.2 Train/Test Split
feature_train, feature_test, target_train, target_test = train_test_split(feature, target)
Take a look at (inspect) the split data:
feature_train.head()
Output:
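Note that train_test_split shuffles with a fresh random seed each run, so scores will wobble from run to run. Passing random_state (shown here with illustrative values) makes the split, and hence your results, reproducible:

feature_train, feature_test, target_train, target_test = train_test_split(
    feature, target, test_size=0.25, random_state=42)  # 42 is an arbitrary example seed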
3.3 RF Model Training
See the commented-out line below? Those are the best parameters PyCharm reported after my hyperparameter search (10+ hours of training). But then I trained with the default parameters instead, and the accuracy actually came out slightly higher. Utterly ridiculous (oh well).
# rf = RandomForestRegressor(max_depth=9, max_features=0.75, min_samples_leaf=2,
#                            min_samples_split=3, n_estimators=500, n_jobs=-1)
rf = RandomForestRegressor()
"""Hyperparameter search result: RandomForestRegressor(max_depth=9, max_features=0.75,
   min_samples_leaf=2, min_samples_split=3, n_estimators=500, n_jobs=-1)"""
rf.fit(feature_train, target_train)
print(rf.score(feature_test, target_test))
Output (the score comes out around 0.9843..., which is fairly high):
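Incidentally, one reason the defaults can look better than the tuned parameters is that a single train/test split is noisy. Cross-validation gives a steadier comparison; a minimal sketch:

from sklearn.model_selection import cross_val_score

scores = cross_val_score(RandomForestRegressor(), feature, target, cv=5, scoring='r2')
print(scores.mean(), scores.std())  # average R2 across folds, and its spread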
Here is my hyperparameter search code:
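A minimal GridSearchCV sketch of that kind of search (the grid values beyond the winning ones reported above are illustrative, not the exact grid I ran):

from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [100, 300, 500],   # illustrative grid values
    'max_depth': [7, 9, 11],
    'max_features': [0.5, 0.75, 1.0],
    'min_samples_leaf': [1, 2, 4],
    'min_samples_split': [2, 3, 5],
}
search = GridSearchCV(RandomForestRegressor(n_jobs=-1), param_grid, cv=5, scoring='r2')
search.fit(feature_train, target_train)
print(search.best_params_)
print(search.best_score_)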
Of course, after training finishes (right after the rf.fit() above) you can add code to save your trained model. Here's mine, briefly:
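A minimal sketch with joblib (the filename is just an example):

import joblib

joblib.dump(rf, 'rf_model.pkl')   # save the trained model
rf = joblib.load('rf_model.pkl')  # load it back later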
4. Model Evaluation
4.1 Imports (of course, you can gather all the imports at the very top instead)
import joblib  # model loading
from sklearn.metrics import explained_variance_score  # explained variance
from sklearn.metrics import mean_absolute_error  # mean absolute error, MAE
from sklearn.metrics import mean_squared_error  # mean squared error, MSE (there doesn't seem to be a separate RMSE function -- but see below)
from sklearn.metrics import mean_squared_log_error  # mean squared logarithmic error, MSLE
from sklearn.metrics import median_absolute_error  # median absolute error
from sklearn.metrics import r2_score  # R-squared
from sklearn.preprocessing import RobustScaler  # scaler
from sklearn.model_selection import train_test_split  # dataset splitting
import pandas as pd
import numpy as np
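On that RMSE comment: older scikit-learn releases indeed have no standalone RMSE function, but once you have predictions (target_pre, computed in 4.2) you can get it either way:

RMSE = np.sqrt(mean_squared_error(target_test, target_pre))  # works on any version
# RMSE = mean_squared_error(target_test, target_pre, squared=False)  # scikit-learn 0.22-1.5
# scikit-learn >= 1.4 also adds sklearn.metrics.root_mean_squared_error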
4.2 Computing the Evaluation Metrics
# Predict
target_pre = rf.predict(feature_test)
# Evaluate
score = rf.score(feature_test, target_test)
print('Prediction score:\n', score)
EVS = explained_variance_score(target_test, target_pre)
print('Explained variance:\n', EVS)
MSE = mean_squared_error(target_test, target_pre)
print('Mean squared error:\n', MSE)
MAE = mean_absolute_error(target_test, target_pre)
print('Mean absolute error:\n', MAE)
MSLE = mean_squared_log_error(target_test, target_pre)
print('Mean squared log error:\n', MSLE)
median = median_absolute_error(target_test, target_pre)
print('Median absolute error:\n', median)
r2 = r2_score(target_test, target_pre)
print('R2 coefficient of determination:\n', r2)
# print('Best model:\n', rf.best_estimator_)
# print('Best parameters:\n', rf.best_params_)
print('Feature importances:\n', rf.feature_importances_)
Output:
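The bare importance array is hard to map back to variables; pairing it with the column names reads much better (a one-line sketch, reusing feature from 3.1):

print(pd.Series(rf.feature_importances_, index=feature.columns).sort_values(ascending=False))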
5. Aside (other models: KNN / linear regression / DT / RF / Bagging(DT) / GBDT / AdaBoost / XGBoost / voting regressor)
5.1 Imports
from sklearn.neighbors import KNeighborsRegressor  # k-nearest neighbors
from sklearn.linear_model import LinearRegression  # linear regression
from sklearn.tree import DecisionTreeRegressor  # decision tree
from sklearn.ensemble import RandomForestRegressor  # random forest
from sklearn.ensemble import BaggingRegressor  # bagging
from sklearn.ensemble import GradientBoostingRegressor  # gradient-boosted trees / GBDT
from sklearn.ensemble import AdaBoostRegressor  # AdaBoost
from xgboost import XGBRFRegressor  # note: XGBRFRegressor is XGBoost's random-forest variant; XGBRegressor is the usual boosted model
from sklearn.ensemble import VotingRegressor  # voting regressor
5.2 Model Training
5.2.1 Setup
# Traditional models
knn = KNeighborsRegressor()
lin = LinearRegression()
# log = LogisticRegression()  # unused here; it is a classifier and would need its own import
dt = DecisionTreeRegressor()
# Bagging models
rf = RandomForestRegressor()
bag = BaggingRegressor()
# Boosting models
gbdt = GradientBoostingRegressor(loss='squared_error')
ada_dt = AdaBoostRegressor(base_estimator=dt)  # note: scikit-learn >= 1.2 renames base_estimator to estimator
ada_lin = AdaBoostRegressor(base_estimator=lin)
xg = XGBRFRegressor()
# Voting regressor
knn_reg = KNeighborsRegressor()   # instantiated but not included in the voting ensemble below
dt_reg = DecisionTreeRegressor()  # likewise unused
rf_reg = RandomForestRegressor()
ada_dt_reg = AdaBoostRegressor(base_estimator=dt)
vote = VotingRegressor(estimators=[('rf_reg', rf_reg), ('ada_dt_reg', ada_dt_reg)])
# Stacking models
stack1_knn = KNeighborsRegressor()
stack1_dt = DecisionTreeRegressor()
stack1_rf = RandomForestRegressor()
stack1_bag = BaggingRegressor()
stack1_gbdt = GradientBoostingRegressor(loss='squared_error')
stack1_ada_dt = AdaBoostRegressor(base_estimator=dt)
stack1_xg = XGBRFRegressor()
stack_model = [stack1_knn, stack1_dt, stack1_rf, stack1_bag, stack1_gbdt, stack1_ada_dt, stack1_xg]
stack2_rf = RandomForestRegressor(n_estimators=500, oob_score=True)

model = [knn, lin, dt, rf, bag, gbdt, ada_dt, ada_lin, xg, vote, stack2_rf]
model_label = ['knn', 'lin', 'dt', 'rf', 'bag', 'gbdt', 'ada_dt', 'ada_lin', 'xg', 'vote', 'stack2_rf']
assess_label = ['score', 'explained variance (EVS)', 'mean squared error (MSE)',
                'mean absolute error (MAE)', 'median absolute error', 'R2']
comparion = pd.DataFrame(index=model_label, columns=assess_label)
Take a look:
comparion
Output (nothing has been filled in yet, so everything is NaN):
5.2.2 Training Each Model
Here I separately output a chart of RF's feature importances along with some other information.
Also, because of how the stacking model is structured, I didn't write its evaluation metrics into comparion, since that wouldn't be meaningful; I only print its score (which is just about the highest of all).
import matplotlib.pyplot as plt  # needed for the plots below
import seaborn as sns

plt.rcParams['font.sans-serif'] = ['SimHei']  # display Chinese labels correctly
plt.rcParams['axes.unicode_minus'] = False    # display minus signs correctly
for estimator_index, estimator in enumerate(model):
    if estimator is knn:
        estimator.fit(feature_train, target_train.astype('int'))
    elif estimator is stack2_rf:
        # Train the first-layer models
        # (this held-out sub-split is created but, as written, never used below)
        feature_train1, feature_test1, target_train1, target_test1 = train_test_split(feature_train, target_train)
        for stack_estimator in stack_model:
            stack_estimator.fit(feature_train, target_train)
        # Container for the first-layer predictions
        target_stack_pre = np.empty((len(feature_test), len(stack_model)), dtype=np.float64)
        # Predict
        for stack_estimator_index, stack_estimator in enumerate(stack_model):
            target_stack_pre[:, stack_estimator_index] = stack_estimator.predict(feature_test)
        # Train the second-layer model
        estimator.fit(target_stack_pre, target_test)
        print('Stacking model score:\t', estimator.oob_score_)
        break
    else:
        estimator.fit(feature_train, target_train)
    print(estimator_index)
    # Predict the target
    target_pre = estimator.predict(feature_test)
    # Score
    score = estimator.score(feature_test, target_test)
    # Explained variance
    EVS = explained_variance_score(target_test, target_pre)
    # Mean squared error
    MSE = mean_squared_error(target_test, target_pre)
    # Mean absolute error
    MAE = mean_absolute_error(target_test, target_pre)
    # Mean squared log error
    # MSLE = mean_squared_log_error(target_test, target_pre)
    # Median absolute error
    median = median_absolute_error(target_test, target_pre)
    # R2 coefficient of determination
    r2 = r2_score(target_test, target_pre)
    # Collect
    # assess = [score, EVS, MSE, MAE, MSLE, median, r2]
    assess = [score, EVS, MSE, MAE, median, r2]
    if estimator is rf:
        importances = estimator.feature_importances_
        print(data_cols)  # data_cols: the feature column names, defined earlier
        print(importances)
        fg, ax = plt.subplots()
        sns.barplot(x=data_cols, y=importances)
        ax.set_title('Feature importances under the random forest model')
        plt.show()
    # Write this model's metrics into the comparison table
    for model_assess_index, model_assess in enumerate(assess):
        comparion.iloc[estimator_index, model_assess_index] = model_assess
Output:
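As an aside, scikit-learn (0.22+) also ships a built-in StackingRegressor that handles both layers, with properly cross-validated first-layer predictions, in one object. A minimal sketch of swapping it in for the hand-rolled version above:

from sklearn.ensemble import StackingRegressor

stack = StackingRegressor(
    estimators=[('knn', KNeighborsRegressor()),
                ('dt', DecisionTreeRegressor()),
                ('gbdt', GradientBoostingRegressor())],
    final_estimator=RandomForestRegressor(n_estimators=500))
stack.fit(feature_train, target_train)
print(stack.score(feature_test, target_test))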
5.2.3 Results
comparion = comparion.transpose()
comparion
Output:
And some truly hideous charts:
plt.rcParams['font.sans-serif'] = ['SimHei']  # display Chinese labels correctly
plt.rcParams['axes.unicode_minus'] = False    # display minus signs correctly
for i in range(len(comparion.index)):  # one chart per metric row (a hard-coded range(8) would overrun the 6 rows)
    plt.figure(figsize=(20, 8), dpi=100)
    sns.barplot(x=comparion.columns, y=comparion.iloc[i, :], palette='Accent')
    plt.show()
Output (not shown in detail):
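If one figure per metric feels clunky, pandas can draw all the panels in one call (a sketch; the layout numbers are just examples, and astype(float) is needed because comparion was built with object dtype):

comparion.T.astype(float).plot(kind='bar', subplots=True, layout=(3, 2),
                               figsize=(20, 12), legend=False)
plt.tight_layout()
plt.show()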
If you have any questions, feel free to discuss them with me.