Chp7-3
2019 年 12 月 23 日
# Decision-tree classification of the German credit data set.
#
# Reconstructed from a notebook export: each "In [n]" marker below is one of
# the original cells, and the outputs recorded in that export are preserved
# as comments next to the statement that produced them.

# In [21]
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from scipy import stats
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Raw string so the backslashes of the Windows path are never interpreted as
# escape sequences.  The trailing blank that followed ".csv" in the exported
# text has been dropped from the file name.
DATA_PATH = r"C:\Python\Scripts\my_data\german_credit_data_dataset.csv"

my_data = pd.read_csv(DATA_PATH)

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() only appended a stray "None" line.
my_data.info()
# Recorded output (the original run ended with that extra "None" line):
#   <class 'pandas.core.frame.DataFrame'>
#   RangeIndex: 1000 entries, 0 to 999
#   Data columns (total 21 columns):
#   checking_account_status    1000 non-null object
#   duration                   1000 non-null int64
#   credit_history             1000 non-null object
#   purpose                    1000 non-null object
#   credit_amount              1000 non-null float64
#   savings                    1000 non-null object
#   present_employment         1000 non-null object
#   installment_rate           1000 non-null float64
#   personal                   1000 non-null object
#   other_debtors              1000 non-null object
#   present_residence          1000 non-null float64
#   property                   1000 non-null object
#   age                        1000 non-null float64
#   other_installment_plans    1000 non-null object
#   housing                    1000 non-null object
#   existing_credits           1000 non-null float64
#   job                        1000 non-null object
#   dependents                 1000 non-null int64
#   telephone                  1000 non-null object
#   foreign_worker             1000 non-null object
#   customer_type              1000 non-null int64
#   dtypes: float64(5), int64(3), object(13)
#   memory usage: 164.1+ KB

# In [52]
# Categorical columns that are one-hot encoded before modelling.
feature_col = ['checking_account_status', 'personal']

# Start from the target and the numeric feature, then append one block of
# dummy columns per categorical column (named "<column>_<level>") in a single
# concat instead of growing the frame inside a loop.
X = pd.concat(
    [my_data[['customer_type', 'credit_amount']]]
    + [pd.get_dummies(my_data[[col]], prefix=col) for col in feature_col],
    axis=1,
)

# Columns actually fed to the tree.
XX_feature = ['credit_amount', 'checking_account_status_A14', 'personal_A91',
              'personal_A92', 'personal_A93', 'personal_A94']
XX = X[XX_feature]
Y = X['customer_type']

# Hold out 20% of the rows for evaluation; the seed fixes the split.
X_train, X_test, Y_train, Y_test = train_test_split(
    XX, Y, test_size=0.2, random_state=0)

# Seed the tree as well: scikit-learn permutes the candidate features at every
# split, so an unseeded tree may break ties differently from run to run.
my_tree = DecisionTreeClassifier(max_depth=3, random_state=0)
my_tree.fit(X_train, Y_train)

print('分类结果为: ', my_tree.predict(X_test), '\n')
print('平均准确率为: ', my_tree.score(X_test, Y_test))
# Recorded output of the original (unseeded) run:
#   分类结果为: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1
#    1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1]
#   平均准确率为: 0.71

# In [54]
# Pair every input column with the importance the fitted tree assigned to it.
# Printed explicitly because a bare expression is only displayed inside a
# notebook cell, not when the file runs as a script.
importance_table = pd.DataFrame({'feature': XX.columns,
                                 'importance': my_tree.feature_importances_})
print(importance_table)
# Recorded output (Out[54]) of the original run:
#                          feature  importance
#   0                credit_amount    0.314532
#   1  checking_account_status_A14    0.671787
#   2                 personal_A91    0.013680
#   3                 personal_A92    0.000000
#   4                 personal_A93    0.000000
#   5                 personal_A94    0.000000

# In [55]
plt.figure(figsize=(18, 12))
# feature_names is handed over as a plain list so the call does not depend on
# plot_tree accepting a pandas Index.  class_names are matched to the classes
# in ascending order, so customer_type 1 is labelled 'Good' and 2 'Bad'.
tree.plot_tree(my_tree, fontsize=12, feature_names=list(XX.columns),
               class_names=['Good', 'Bad'])
# No extension is given, so matplotlib appends its default output format.
plt.savefig('my_tree')