Chp8-4
December 23, 2019
In [21]: import numpy as np
         import pandas as pd
         data1=np.random.rand(1000)  # uniformly distributed random numbers on [0, 1]
         data2=np.random.rand(1000)
         data3=np.random.rand(1000)
         data4=np.random.rand(1000)
         data5=np.random.rand(1000)
         pd.DataFrame(data1).hist(bins=10)
         print('Of the 1000 values in data1,', (data1>0.5).sum(), 'are greater than 0.5')
         print('Of the 1000 values in data1,', (data1>0.3).sum(), 'are greater than 0.3')

Of the 1000 values in data1, 499 are greater than 0.5
Of the 1000 values in data1, 713 are greater than 0.3
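For Uniform(0, 1) data the expected fraction above a threshold t is 1 - t, so about 500 of the 1000 values should exceed 0.5 and about 700 should exceed 0.3, which matches the counts above. A minimal check (a sketch, not an original notebook cell; it assumes data1 from In [21] is still in scope):

    for t in (0.5, 0.3):
        # expected count under Uniform(0, 1) vs. the observed count in data1
        print('threshold', t, '-> expected about', int((1 - t) * 1000),
              ', observed', (data1 > t).sum())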
In [22]: # Predict 1 when the value is greater than 0.3, otherwise 0.
         # If the true labels are all 1, each model's accuracy is about 0.7.
         model1=np.where(data1>0.3,1,0)
         model2=np.where(data2>0.3,1,0)
         model3=np.where(data3>0.3,1,0)
         model4=np.where(data4>0.3,1,0)
         model5=np.where(data5>0.3,1,0)
         # The mean is the proportion of samples predicted as 1, which here
         # equals the prediction accuracy (the true labels are all 1).
         print('Accuracy of model 1: ', model1.mean())
         print('Accuracy of model 2: ', model2.mean())
         print('Accuracy of model 3: ', model3.mean())
         print('Accuracy of model 4: ', model4.mean())
         print('Accuracy of model 5: ', model5.mean())

Accuracy of model 1:  0.713
Accuracy of model 2:  0.721
Accuracy of model 3:  0.687
Accuracy of model 4:  0.671
Accuracy of model 5:  0.72

In [11]: # Majority vote: sum the five predictions, average, and round to 0 or 1.
         ensemble_preds=np.round((model1+model2+model3+model4+model5)/5.0).astype(int)
         print('Accuracy of the ensemble: ', ensemble_preds.mean())

Accuracy of the ensemble:  0.839

In [23]: # Predict 1 when the value is greater than 0.7, otherwise 0.
         # If the true labels are all 1, each model's accuracy is now only about 0.3.
         model1=np.where(data1>0.7,1,0)
         model2=np.where(data2>0.7,1,0)
         model3=np.where(data3>0.7,1,0)
         model4=np.where(data4>0.7,1,0)
         model5=np.where(data5>0.7,1,0)
         # The mean is the proportion of samples predicted as 1, i.e. the accuracy.
         print('Accuracy of model 1: ', model1.mean())
         print('Accuracy of model 2: ', model2.mean())
         print('Accuracy of model 3: ', model3.mean())
         print('Accuracy of model 4: ', model4.mean())
         print('Accuracy of model 5: ', model5.mean())
         # Majority vote over the five weak models.
         ensemble_preds=np.round((model1+model2+model3+model4+model5)/5.0).astype(int)
         print('Accuracy of the ensemble: ', ensemble_preds.mean())

Accuracy of model 1:  0.305
Accuracy of model 2:  0.319
Accuracy of model 3:  0.285
Accuracy of model 4:  0.291
Accuracy of model 5:  0.319
Accuracy of the ensemble:  0.178

In [24]: # Each model now uses a different threshold, so the individual accuracies
         # vary (roughly 1 minus the threshold), some above 0.5 and some below.
         model1=np.where(data1>0.7,1,0)
         model2=np.where(data2>0.3,1,0)
         model3=np.where(data3>0.6,1,0)
         model4=np.where(data4>0.2,1,0)
         model5=np.where(data5>0.5,1,0)
         # The mean is the proportion of samples predicted as 1, i.e. the accuracy.
         print('Accuracy of model 1: ', model1.mean())
         print('Accuracy of model 2: ', model2.mean())
         print('Accuracy of model 3: ', model3.mean())
         print('Accuracy of model 4: ', model4.mean())
         print('Accuracy of model 5: ', model5.mean())
         # Majority vote over the five models.
         ensemble_preds=np.round((model1+model2+model3+model4+model5)/5.0).astype(int)
         print('Accuracy of the ensemble: ', ensemble_preds.mean())

Accuracy of model 1:  0.305
Accuracy of model 2:  0.721
Accuracy of model 3:  0.383
Accuracy of model 4:  0.778
Accuracy of model 5:  0.512
Accuracy of the ensemble:  0.571
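The three experiments above illustrate the standard majority-voting argument: if the five models err independently, the ensemble is correct whenever at least three of them are, and the binomial distribution predicts how often that happens. A minimal check with scipy (a sketch, not an original notebook cell; it assumes the five models are independent and equally accurate):

    from scipy.stats import binom
    # probability that at least 3 of 5 independent models are correct,
    # when each individual model is correct with probability p
    for p in (0.7, 0.3):
        print('p =', p, '-> predicted ensemble accuracy:', 1 - binom.cdf(2, 5, p))

With p = 0.7 this gives about 0.837, close to the observed 0.839; with p = 0.3 it gives about 0.163, close to the observed 0.178. Voting only helps when the individual models are better than chance.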
In [1]: import pandas as pd
        import numpy as np
        from scipy import stats
        from matplotlib import pyplot as plt
        my_data = pd.read_csv(r"C:\Python\Scripts\my_data\german_credit_data_dataset.csv")  # ,dtype=str
        # customer_type is coded 1/2; subtracting 1 and summing counts the class-2 customers
        hah=my_data[['customer_type']]-1
        print(hah.sum())

customer_type    300
dtype: int64

In [2]: #from sklearn.model_selection import train_test_split
        from sklearn.tree import DecisionTreeClassifier
        from sklearn.model_selection import cross_val_score
        from sklearn.ensemble import RandomForestClassifier
        feature_col=my_data.columns
        X=my_data[['duration']]
        # add the remaining numeric columns (everything except the label and 'duration')
        for n,my_str in enumerate(feature_col):
            if (my_str!='customer_type') & (my_str != 'duration'):
                if my_data[[my_str]].dtypes[0]!=object:
                    X=pd.concat([X,my_data[[my_str]]],axis=1)
        # one-hot encode the categorical (object) columns
        for n,my_str in enumerate(feature_col):
            if my_data[[my_str]].dtypes[0] == object:
                my_dummy=pd.get_dummies(my_data[[my_str]],prefix=my_str)
                X=pd.concat([X,my_dummy],axis=1)
        print(X.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 61 columns):
duration                        1000 non-null int64
credit_amount                   1000 non-null float64
installment_rate                1000 non-null float64
present_residence               1000 non-null float64
age                             1000 non-null float64
existing_credits                1000 non-null float64
dependents                      1000 non-null int64
checking_account_status_A11     1000 non-null uint8
checking_account_status_A12     1000 non-null uint8
checking_account_status_A13     1000 non-null uint8
checking_account_status_A14     1000 non-null uint8
credit_history_A30              1000 non-null uint8
credit_history_A31              1000 non-null uint8
credit_history_A32              1000 non-null uint8
credit_history_A33              1000 non-null uint8
credit_history_A34              1000 non-null uint8
purpose_A40                     1000 non-null uint8
purpose_A41                     1000 non-null uint8
purpose_A410                    1000 non-null uint8
purpose_A42                     1000 non-null uint8
purpose_A43                     1000 non-null uint8
purpose_A44                     1000 non-null uint8
purpose_A45                     1000 non-null uint8
purpose_A46                     1000 non-null uint8
purpose_A48                     1000 non-null uint8
purpose_A49                     1000 non-null uint8
savings_A61                     1000 non-null uint8
savings_A62                     1000 non-null uint8
savings_A63                     1000 non-null uint8
savings_A64                     1000 non-null uint8
savings_A65                     1000 non-null uint8
present_employment_A71          1000 non-null uint8
present_employment_A72          1000 non-null uint8
present_employment_A73          1000 non-null uint8
present_employment_A74          1000 non-null uint8
present_employment_A75          1000 non-null uint8
personal_A91                    1000 non-null uint8
personal_A92                    1000 non-null uint8
personal_A93                    1000 non-null uint8
personal_A94                    1000 non-null uint8
other_debtors_A101              1000 non-null uint8
other_debtors_A102              1000 non-null uint8
other_debtors_A103              1000 non-null uint8
property_A121                   1000 non-null uint8
property_A122                   1000 non-null uint8
property_A123                   1000 non-null uint8
property_A124                   1000 non-null uint8
other_installment_plans_A141    1000 non-null uint8
other_installment_plans_A142    1000 non-null uint8
other_installment_plans_A143    1000 non-null uint8
housing_A151                    1000 non-null uint8
housing_A152                    1000 non-null uint8
housing_A153                    1000 non-null uint8
job_A171                        1000 non-null uint8
job_A172                        1000 non-null uint8
job_A173                        1000 non-null uint8
job_A174                        1000 non-null uint8
telephone_A191                  1000 non-null uint8
telephone_A192                  1000 non-null uint8
foreign_worker_A201             1000 non-null uint8
foreign_worker_A202             1000 non-null uint8
dtypes: float64(5), int64(2), uint8(54)
memory usage: 107.5 KB
None

In [3]: estimator_range=range(10,400,10)
        my_scores=[]
        for estimator in estimator_range:
            my_tree=RandomForestClassifier(n_estimators=estimator)
            accuracy_scores=cross_val_score(my_tree,X,my_data['customer_type'],
                                            cv=5,scoring='roc_auc')
            my_scores.append(accuracy_scores.mean())

In [4]: plt.plot(estimator_range,my_scores)
        plt.xlabel('the number of trees')
        plt.ylabel('ROC_AUC')

Out[4]: Text(0,0.5,'ROC_AUC')
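One possible follow-up, not part of the original notebook, is to read the best tree count directly from my_scores instead of eyeballing the plot (the n_estimators=150 used in the next cell is presumably the author's choice from the figure):

    # index of the sweep value with the highest mean cross-validated ROC AUC
    best_idx = int(np.argmax(my_scores))
    print('best n_estimators:', list(estimator_range)[best_idx],
          'with mean ROC AUC:', my_scores[best_idx])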
In [5]: my_RF=RandomForestClassifier(n_estimators=150)
        my_RF.fit(X,my_data['customer_type'])
        pd.DataFrame({'feature':X.columns,
                      'importance':my_RF.feature_importances_}).sort_values('importance',
                                                                            ascending=False)

Out[5]:
    feature                        importance
1   credit_amount                  0.102241
4   age                            0.077722
0   duration                       0.077652
10  checking_account_status_A14    0.047908
7   checking_account_status_A11    0.039347
3   present_residence              0.034465
2   installment_rate               0.033480
15  credit_history_A34             0.021523
26  savings_A61                    0.019369
5   existing_credits               0.017395
8   checking_account_status_A12    0.017350
43  property_A121                  0.017015
49  other_installment_plans_A143   0.016411
20  purpose_A43                    0.016216
16  purpose_A40                    0.016094
37  personal_A92                   0.015763
55  job_A173                       0.015198
33  present_employment_A73         0.015030
51  housing_A152                   0.014993
13  credit_history_A32             0.014474
32  present_employment_A72         0.014454
30  savings_A65                    0.014404
38  personal_A93                   0.014346
19  purpose_A42                    0.014143
56  job_A174                       0.014014
45  property_A123                  0.013968
44  property_A122                  0.013528
47  other_installment_plans_A141   0.013023
57  telephone_A191                 0.012869
35  present_employment_A75         0.012681
..  ...                            ...
12  credit_history_A31             0.011835
6   dependents                     0.011585
11  credit_history_A30             0.011464
50  housing_A151                   0.010970
46  property_A124                  0.010738
54  job_A172                       0.010724
34  present_employment_A74         0.010696
17  purpose_A41                    0.009589
25  purpose_A49                    0.009458
40  other_debtors_A101             0.009290
31  present_employment_A71         0.009159
14  credit_history_A33             0.008633
27  savings_A62                    0.008608
23  purpose_A46                    0.008341
39  personal_A94                   0.008258
9   checking_account_status_A13    0.008238
48  other_installment_plans_A142   0.007623
42  other_debtors_A103             0.007613
36  personal_A91                   0.007098
41  other_debtors_A102             0.006547
52  housing_A153                   0.006518
28  savings_A63                    0.005626
29  savings_A64                    0.005101
22  purpose_A45                    0.004062
60  foreign_worker_A202            0.003173
59  foreign_worker_A201            0.002837
53  job_A171                       0.002301
21  purpose_A44                    0.001803
18  purpose_A410                   0.001542
24  purpose_A48                    0.001007

[61 rows x 2 columns]

In [6]: print(my_RF)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=150, n_jobs=None,
            oob_score=False, random_state=None, verbose=0, warm_start=False)
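To attach a performance estimate to this final 150-tree forest, one option (a sketch, not an original cell) is to cross-validate it the same way the sweep in In [3] did:

    # 5-fold cross-validated ROC AUC for the chosen forest;
    # cross_val_score clones my_RF, so the fit above is untouched
    final_auc = cross_val_score(my_RF, X, my_data['customer_type'],
                                cv=5, scoring='roc_auc')
    print('mean ROC AUC with 150 trees:', final_auc.mean())

Passing random_state (and optionally oob_score=True) when constructing the forest would make the run reproducible and provide an out-of-bag estimate as well.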