2.6 With new data in hand, does simply adding more data improve the results?
from sklearn.ensemble import RandomForestRegressor

# Drop the new features so the feature set matches the original data
original_train_features = train_features[:, original_feature_indices]
original_test_features = test_features[:, original_feature_indices]

rf = RandomForestRegressor(n_estimators=100, random_state=0)
rf.fit(original_train_features, train_labels);

# Predict
baseline_predictions = rf.predict(original_test_features)

# Evaluate
baseline_errors = abs(baseline_predictions - test_labels)
print('Average temperature error:', round(np.mean(baseline_errors), 2), 'degrees.')

# Mean absolute percentage error (MAPE)
baseline_mape = 100 * np.mean(baseline_errors / test_labels)

# Accuracy
baseline_accuracy = 100 - baseline_mape
print('Accuracy:', round(baseline_accuracy, 2), '%.')
2.7 Adding the new features and taking another look
# Now include the new features
from sklearn.ensemble import RandomForestRegressor

rf_exp = RandomForestRegressor(n_estimators=100, random_state=0)
rf_exp.fit(train_features, train_labels)

# Same test set
predictions = rf_exp.predict(test_features)

# Evaluate
errors = abs(predictions - test_labels)
print('Average temperature error:', round(np.mean(errors), 2), 'degrees.')

# Mean absolute percentage error (MAPE)
mape = np.mean(100 * (errors / test_labels))

# How much did we improve over the baseline?
improvement_baseline = 100 * abs(mape - baseline_mape) / baseline_mape
print('Improvement after adding features:', round(improvement_baseline, 2), '%.')

# Accuracy
accuracy = 100 - mape
print('Accuracy:', round(accuracy, 2), '%.')
The overall model performance improved slightly. We also added an extra evaluation here: the improvement relative to the baseline model, which makes the comparison easier to read. Now that we have more features, we can take a proper look at the feature importance metric. It should only be treated as a reference, but the industry has some unwritten conventions around it, so let's take a look:
2.8 Feature importance
# Importance of each feature
importances = list(rf_exp.feature_importances_)

# Pair each feature name with its importance value
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)]

# Sort by importance, descending
feature_importances = sorted(feature_importances, key=lambda x: x[1], reverse=True)

# Print them out
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];

# Plot style
plt.style.use('fivethirtyeight')

# x positions
x_values = list(range(len(importances)))

# Bar plot
plt.bar(x_values, importances, orientation='vertical', color='r', edgecolor='k', linewidth=1.2)

# x-axis labels have to be written vertically
plt.xticks(x_values, feature_list, rotation='vertical')

# Axis labels and title
plt.ylabel('Importance'); plt.xlabel('Variable'); plt.title('Variable Importances');
Previously we only took a quick look at which features matter most. This time we consider cumulative feature importance: first sort the features by importance, then compute the running total with the cumsum() function. For example, cumsum([1, 2, 3, 4]) yields the cumulative sums (1, 3, 6, 10). A common threshold is 95%: we count how many of the top features it takes for their cumulative importance to exceed that threshold, and keep those as the selected features.
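As a quick toy illustration of this cumsum-plus-threshold idea (a standalone sketch with made-up importance values, not our model's actual numbers):

import numpy as np

# Hypothetical, already-sorted importance values
sorted_imps = [0.55, 0.25, 0.10, 0.06, 0.04]
cum_imps = np.cumsum(sorted_imps)
print(cum_imps)                             # [0.55 0.8  0.9  0.96 1.  ]

# Index of the first position where the running total exceeds 0.95,
# plus one to convert from index to feature count
print(np.where(cum_imps > 0.95)[0][0] + 1)  # 4 features needed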
2.9 Cumulative feature importance: how many features before 95%?
# Sort the features by importance
sorted_importances = [importance[1] for importance in feature_importances]
sorted_features = [importance[0] for importance in feature_importances]

# Cumulative importance
cumulative_importances = np.cumsum(sorted_importances)

# Line plot
plt.plot(x_values, cumulative_importances, 'g-')

# Dashed red line at the 0.95 threshold
plt.hlines(y=0.95, xmin=0, xmax=len(sorted_importances), color='r', linestyles='dashed')

# x-axis
plt.xticks(x_values, sorted_features, rotation='vertical')

# Axis labels and title
plt.xlabel('Variable'); plt.ylabel('Cumulative Importance'); plt.title('Cumulative Importances');

Here the cumulative value passes 95% once the fifth feature is included. That sets up our next comparison experiment: what happens to the results if we use only these five features? And what about the time efficiency?

# How many features does it take to reach 95%?
print('Number of features for 95% importance:', np.where(cumulative_importances > 0.95)[0][0] + 1)

# Select those features
important_feature_names = [feature[0] for feature in feature_importances[0:5]]

# Find their column indices
important_indices = [feature_list.index(feature) for feature in important_feature_names]

# Rebuild the training and test sets
important_train_features = train_features[:, important_indices]
important_test_features = test_features[:, important_indices]

# Data dimensions
print('Important train features shape:', important_train_features.shape)
print('Important test features shape:', important_test_features.shape)

# Retrain the model
rf_exp.fit(important_train_features, train_labels);

# Same test set
predictions = rf_exp.predict(important_test_features)

# Evaluate
errors = abs(predictions - test_labels)
print('Average temperature error:', round(np.mean(errors), 2), 'degrees.')

mape = 100 * (errors / test_labels)

# Accuracy
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')
The performance actually dropped. This is not surprising for a random forest: the algorithm already accounts for feature quality internally and preferentially picks the valuable features when building trees, so removing features by hand only shrinks the pool of candidates it can choose from.
2.10 Measuring time efficiency
# Time to measure runtime
import time

# First with all features
all_features_time = []

# A single run may be noisy, so run 10 times and average
for _ in range(10):
    start_time = time.time()
    rf_exp.fit(train_features, train_labels)
    all_features_predictions = rf_exp.predict(test_features)
    end_time = time.time()
    all_features_time.append(end_time - start_time)

all_features_time = np.mean(all_features_time)
print('Average time to train and test with all features:', round(all_features_time, 2), 'seconds.')
With all features, training plus testing takes about 0.5 seconds in total. Machine performance differs, so your own runs in a notebook will probably take a little longer than mine. Now let's look at the time when only the high-importance features are used:
# Now with only the important features
reduced_features_time = []

# A single run may be noisy, so run 10 times and average
for _ in range(10):
    start_time = time.time()
    rf_exp.fit(important_train_features, train_labels)
    reduced_features_predictions = rf_exp.predict(important_test_features)
    end_time = time.time()
    reduced_features_time.append(end_time - start_time)

reduced_features_time = np.mean(reduced_features_time)
print('Average time to train and test with the reduced features:', round(reduced_features_time, 2), 'seconds.')

# Accuracy vs. run time: compute the evaluation from each model's predictions
all_accuracy = 100 * (1 - np.mean(abs(all_features_predictions - test_labels) / test_labels))
reduced_accuracy = 100 * (1 - np.mean(abs(reduced_features_predictions - test_labels) / test_labels))

# Collect the results in a DataFrame
comparison = pd.DataFrame({'features': ['all (17)', 'reduced (5)'],
                           'run_time': [round(all_features_time, 2), round(reduced_features_time, 2)],
                           'accuracy': [round(all_accuracy, 2), round(reduced_accuracy, 2)]})
comparison[['features', 'accuracy', 'run_time']]

relative_accuracy_decrease = 100 * (all_accuracy - reduced_accuracy) / all_accuracy
print('Relative accuracy decrease:', round(relative_accuracy_decrease, 3), '%.')

relative_runtime_decrease = 100 * (all_features_time - reduced_features_time) / all_features_time
print('Relative run-time improvement:', round(relative_runtime_decrease, 3), '%.')
When we buy something we usually weigh value for money, and the same trade-off applies here: the gain in time efficiency is relatively large, while the model's accuracy stays roughly the same. Finally, let's gather all of the experiment results into one comparison:
# Pandas is used for data manipulation
import pandas as pd

# Read in data as a pandas dataframe
original_features = pd.read_csv('data/temps.csv')
original_features = pd.get_dummies(original_features)

# Use numpy to convert to arrays
import numpy as np

# Labels are the values we want to predict
original_labels = np.array(original_features['actual'])

# Remove the labels from the features
# axis 1 refers to the columns
original_features = original_features.drop('actual', axis=1)

# Saving feature names for later use
original_feature_list = list(original_features.columns)

# Convert to numpy array
original_features = np.array(original_features)

# Using Scikit-learn to split data into training and testing sets
from sklearn.model_selection import train_test_split
original_train_features, original_test_features, original_train_labels, original_test_labels = train_test_split(
    original_features, original_labels, test_size=0.25, random_state=42)

# Find the original feature indices
original_feature_indices = [feature_list.index(feature) for feature in feature_list
                            if feature not in ['ws_1', 'prcp_1', 'snwd_1']]

# Create a test set of the original features
original_test_features = test_features[:, original_feature_indices]

# Time to train on the original data set (1 year)
original_features_time = []

# Do 10 iterations and take the average
for _ in range(10):
    start_time = time.time()
    rf.fit(original_train_features, original_train_labels)
    original_features_predictions = rf.predict(original_test_features)
    end_time = time.time()
    original_features_time.append(end_time - start_time)

original_features_time = np.mean(original_features_time)

# Calculate mean absolute error for each model
original_mae = np.mean(abs(original_features_predictions - test_labels))
exp_all_mae = np.mean(abs(all_features_predictions - test_labels))
exp_reduced_mae = np.mean(abs(reduced_features_predictions - test_labels))

# Calculate accuracy for the model trained on 1 year of data
original_accuracy = 100 * (1 - np.mean(abs(original_features_predictions - test_labels) / test_labels))

# Create a dataframe for comparison
model_comparison = pd.DataFrame({'model': ['original', 'exp_all', 'exp_reduced'],
                                 'error (degrees)': [original_mae, exp_all_mae, exp_reduced_mae],
                                 'accuracy': [original_accuracy, all_accuracy, reduced_accuracy],
                                 'run_time (s)': [original_features_time, all_features_time, reduced_features_time]})

# Order the dataframe columns
model_comparison = model_comparison[['model', 'error (degrees)', 'accuracy', 'run_time (s)']]
model_comparison

# Summarize everything in one figure
# One row of panels is easier to read
fig, (ax1, ax2, ax3) = plt.subplots(nrows=1, ncols=3, figsize=(16, 5), sharex=True)

# x-axis
x_values = [0, 1, 2]
labels = list(model_comparison['model'])
plt.xticks(x_values, labels)

# Font sizes
fontdict = {'fontsize': 18}
fontdict_yaxis = {'fontsize': 14}

# Error between predicted and actual temperature
ax1.bar(x_values, model_comparison['error (degrees)'], color=['b', 'r', 'g'], edgecolor='k', linewidth=1.5)
ax1.set_ylim(bottom=3.5, top=4.5)
ax1.set_ylabel('Error (degrees) (F)', fontdict=fontdict_yaxis)
ax1.set_title('Model Error Comparison', fontdict=fontdict)

# Accuracy comparison
ax2.bar(x_values, model_comparison['accuracy'], color=['b', 'r', 'g'], edgecolor='k', linewidth=1.5)
ax2.set_ylim(bottom=92, top=94)
ax2.set_ylabel('Accuracy (%)', fontdict=fontdict_yaxis)
ax2.set_title('Model Accuracy Comparison', fontdict=fontdict)

# Run-time comparison
ax3.bar(x_values, model_comparison['run_time (s)'], color=['b', 'r', 'g'], edgecolor='k', linewidth=1.5)
ax3.set_ylim(bottom=0, top=1)
ax3.set_ylabel('Run Time (sec)', fontdict=fontdict_yaxis)
ax3.set_title('Model Run-Time Comparison', fontdict=fontdict);

Here original is our old dataset, the one with less data and fewer features; exp_all is the full new dataset; exp_reduced is the subset of important features selected with the 95% threshold. The outcome is clear: more data and more features improve the results somewhat, but time efficiency suffers in return.
3. Algorithm Parameter Selection and Tuning
3.1 Loading the data
# Pandas is used for data manipulation
import pandas as pd

# Read in data as a dataframe
features = pd.read_csv('data/temps_extended.csv')

# One-hot encoding
features = pd.get_dummies(features)

# Extract features and labels
labels = features['actual']
features = features.drop('actual', axis=1)

# List of features for later use
feature_list = list(features.columns)

# Convert to numpy arrays
import numpy as np
features = np.array(features)
labels = np.array(labels)

# Training and testing sets
from sklearn.model_selection import train_test_split
train_features, test_features, train_labels, test_labels = train_test_split(
    features, labels, test_size=0.25, random_state=42)

print('Training Features Shape:', train_features.shape)
print('Training Labels Shape:', train_labels.shape)
print('Testing Features Shape:', test_features.shape)
print('Testing Labels Shape:', test_labels.shape)

print('{:0.1f} years of data in the training set'.format(train_features.shape[0] / 365.))
print('{:0.1f} years of data in the test set'.format(test_features.shape[0] / 365.))
3.2 Sticking with the six highest-importance features
# Names of the six features accounting for roughly 95% of the total importance
important_feature_names = ['temp_1', 'average', 'ws_1', 'temp_2', 'friend', 'year']

# Find the columns of the most important features
important_indices = [feature_list.index(feature) for feature in important_feature_names]

# Create training and testing sets with only the important features
important_train_features = train_features[:, important_indices]
important_test_features = test_features[:, important_indices]

# Sanity check on the shapes
print('Important train features shape:', important_train_features.shape)
print('Important test features shape:', important_test_features.shape)

# Use only the most important features
train_features = important_train_features[:]
test_features = important_test_features[:]

# Update the feature list for visualizations
feature_list = important_feature_names[:]

# Inspect the tunable parameters
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(random_state=42)

from pprint import pprint

# Print all parameters
pprint(rf.get_params())
3.3 Trying out parameter combinations
Tuning is a long road: the number of possible parameter combinations is simply too large, so we need a systematic approach. First up is RandomizedSearchCV(). This function keeps randomly sampling parameter combinations from the candidate space, builds a model with each, and reports its cross-validated score. Why sample randomly instead of just trying every combination in order? Suppose we have 5 parameters to set and each has 10 candidate values: that is already 10^5 = 100,000 combinations. Since fitting a model is not cheap, and on a large dataset finishing a single model in a few hours is already good going, we can rarely afford to enumerate everything. Random sampling becomes the practical strategy: it usually gets us to a reasonably good combination. All of the function's parameters are explained in detail in the API documentation.
from sklearn.model_selection import RandomizedSearchCV

# Number of trees to build
n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]

# How the maximum number of features per split is chosen
max_features = ['auto', 'sqrt']

# Maximum depth of the trees
max_depth = [int(x) for x in np.linspace(10, 20, num=2)]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

# Minimum samples per leaf: no split may leave a child with fewer samples than this
min_samples_leaf = [1, 2, 4]

# Whether to bootstrap-sample the training data
bootstrap = [True, False]

# Random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
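To see why random sampling pays off even on this modest grid, we can count the exhaustive grid defined above (a quick sketch; the arithmetic is just the product of the candidate-list lengths):

# Size of the exhaustive grid: 10 * 2 * 3 * 3 * 3 * 2 = 1080 combinations,
# each of which would also be refit once per cross-validation fold
n_combinations = 1
for values in random_grid.values():
    n_combinations *= len(values)
print('Full grid size:', n_combinations)  # 1080, vs. the 100 random draws used below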
In this task we only give an illustrative example, and for time reasons the candidate values are deliberately few. One thing worth emphasizing is that the value range of each candidate parameter needs careful thought: if the search space is chosen poorly, even the best result found within it will not be very good. You can start from published experience values and then adjust the space based on your own experiment results. This is an iterative process; a machine learning modeling task does not simply run from front to back. Once results are in, you will usually circle back to compare different parameters and different preprocessing schemes.
# Randomly search for a good parameter combination
rf = RandomForestRegressor()

rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid,
                               n_iter=100, scoring='neg_mean_absolute_error',
                               cv=3, verbose=2, random_state=42, n_jobs=-1)

# Run the search
rf_random.fit(train_features, train_labels)
rf_random.best_params_
Let's walk through the commonly used parameters of RandomizedSearchCV. They are all explained in the API documentation; getting into the habit of consulting the docs is highly recommended.
estimator: RandomizedSearchCV is a general-purpose tool rather than something designed specifically for random forests, so we have to tell it which model to use.

param_distributions: the candidate parameter space; we already supplied it above as a dictionary (the random_grid).

n_iter: how many random parameter combinations to sample. Setting it to 100 means 100 combinations will be tried and the best one kept.

scoring: the evaluation metric used to rank combinations. Note that sklearn scorers follow a higher-is-better convention, which is why MAE appears as 'neg_mean_absolute_error'.

cv: the number of cross-validation folds, which we have covered before.

verbose: how much progress information gets printed; set it to taste.

random_state: the random seed. Fixing it removes the random component so that results are reproducible; use your lucky number.

n_jobs: the number of parallel jobs; -1 uses every available core, though your machine may get a bit sluggish.
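Once the search has finished, the fitted object exposes the results. A small sketch for inspecting them (the attribute names are standard sklearn; the printed values will of course depend on your run):

import pandas as pd

# Best cross-validated score; sklearn reports negative MAE, so negate it to read it as MAE
print('Best CV MAE:', -rf_random.best_score_)

# Full log of all 100 sampled combinations, sortable by mean test score
cv_results = pd.DataFrame(rf_random.cv_results_)
print(cv_results[['params', 'mean_test_score', 'std_test_score']]
      .sort_values('mean_test_score', ascending=False)
      .head())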
Even with n_jobs set to -1, the search still runs slowly: we are building 100 models to choose the parameters, each with 3-fold cross-validation, which amounts to 300 fitting tasks in total.
3.4 Evaluation function
Next let's compare the tuned result against the result with default parameters. All of the defaults are documented in the API; for example, n_estimators : integer, optional (default=10) means the random forest builds 10 trees by default. First, the evaluation criterion:
def evaluate(model, test_features, test_labels):
    predictions = model.predict(test_features)
    errors = abs(predictions - test_labels)
    mape = 100 * np.mean(errors / test_labels)
    accuracy = 100 - mape
    print('Average temperature error:', np.mean(errors))
    print('Accuracy = {:0.2f}%.'.format(accuracy))

# The old model (default parameters)
base_model = RandomForestRegressor(random_state=42)
base_model.fit(train_features, train_labels)
evaluate(base_model, test_features, test_labels)

# The new recipe (best parameters found)
best_random = rf_random.best_estimator_
evaluate(best_random, test_features, test_labels)
3.5 Fine-tuning with Grid Search
from sklearn.model_selection import GridSearchCV

# Grid search candidates
param_grid = {
    'bootstrap': [True],
    'max_depth': [8, 10, 12],
    'max_features': ['auto'],
    'min_samples_leaf': [2, 3, 4, 5, 6],
    'min_samples_split': [3, 5, 7],
    'n_estimators': [800, 900, 1000, 1200]
}

# Base model
rf = RandomForestRegressor()

# Grid search with 3-fold cross-validation
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid,
                           scoring='neg_mean_absolute_error', cv=3,
                           n_jobs=-1, verbose=2)

# Run the search
grid_search.fit(train_features, train_labels)
grid_search.best_params_

best_grid = grid_search.best_estimator_
evaluate(best_grid, test_features, test_labels)
3.6 Another group of Grid Search contenders
After this further round of tuning, the model improved again. It is only a small gain, but small steps accumulated add up to a real result. When using grid search, the number of combinations to enumerate grows quickly, so instead of throwing every possibility into a single grid we usually split the candidates into separate groups and run them one at a time. Here is another group of grid-search contenders:

param_grid = {
    'bootstrap': [True],
    'max_depth': [12, 15, None],
    'max_features': [3, 4, 'auto'],
    'min_samples_leaf': [5, 6, 7],
    'min_samples_split': [7, 10, 13],
    'n_estimators': [900, 1000, 1200]
}

# Base model
rf = RandomForestRegressor()

# Keep searching
grid_search_ad = GridSearchCV(estimator=rf, param_grid=param_grid,
                              scoring='neg_mean_absolute_error', cv=3,
                              n_jobs=-1, verbose=2)

grid_search_ad.fit(train_features, train_labels)
grid_search_ad.best_params_

best_grid_ad = grid_search_ad.best_estimator_
evaluate(best_grid_ad, test_features, test_labels)
3.7 The final model
print('Final model parameters:\n')
pprint(best_grid_ad.get_params())
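As a side note not covered in the walkthrough above, once you settle on a final model it can be persisted with joblib so it does not have to be retrained every time; the filename here is just a placeholder:

from joblib import dump, load

# Save the tuned model to disk (hypothetical filename)
dump(best_grid_ad, 'rf_final_model.joblib')

# ...and load it back later for predictions
rf_loaded = load('rf_final_model.joblib')
evaluate(rf_loaded, test_features, test_labels)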
To wrap up, let's summarize our tuning task:

1. The parameter space matters a great deal; it has a decisive effect on the results. Before starting, pick a roughly sensible range for each parameter, for example by consulting experience values from papers on similar tasks.

2. Random search saves time, especially at the start of a task when we have no idea where in the space a parameter works best. We can set the intervals wide and use random search to locate the promising regions first.

3. Grid search is the carpet-bombing stage: once we have a rough location, it is the tool for pinning down the optimum nearby. Random search and grid search work well together as a one-two combination.

4. There are many more tuning methods, for example Bayesian optimization, which is quite interesting. Think back to the approaches above: every trial is independent and has no influence on later trials. The core idea of Bayesian optimization is that each evaluation accumulates experience, gradually homing in on where the best solution should lie, so earlier results do shape later ones. If that interests you, take a look at the Hyperopt package, which is also easy to use; a sketch follows this list.
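A minimal Hyperopt sketch for our regressor (the search space and settings below are illustrative assumptions, not tuned recommendations; cross_val_score comes from sklearn and train_features/train_labels are the arrays prepared above):

from hyperopt import fmin, tpe, hp, Trials
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

# Search space: quniform samples floats on a grid, so cast to int in the objective
space = {
    'n_estimators': hp.quniform('n_estimators', 200, 1200, 100),
    'max_depth': hp.quniform('max_depth', 5, 20, 1),
    'min_samples_leaf': hp.quniform('min_samples_leaf', 1, 7, 1),
}

def objective(params):
    model = RandomForestRegressor(n_estimators=int(params['n_estimators']),
                                  max_depth=int(params['max_depth']),
                                  min_samples_leaf=int(params['min_samples_leaf']),
                                  random_state=42)
    # TPE minimizes the returned value, so return the (positive) cross-validated MAE
    mae = -cross_val_score(model, train_features, train_labels,
                           scoring='neg_mean_absolute_error', cv=3).mean()
    return mae

# Each evaluation is recorded in trials and informs the next suggestion
trials = Trials()
best = fmin(fn=objective, space=space, algo=tpe.suggest,
            max_evals=20, trials=trials)
print('Best parameters found:', best)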