Chp6-4: Odds, log-odds, and logistic regression on the bike-share data
# In [1]: build a lookup table mapping probability -> odds -> log-odds.
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

table = pd.DataFrame(
    {'prob': [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.99]}
)
# odds = p / (1 - p); log-odds (the logit) is the natural log of the odds.
table['odds'] = table['prob'] / (1 - table['prob'])
table['log-odds'] = np.log(table['odds'])
table
# Out[1] (recorded):
#     prob       odds  log-odds
# 0   0.01   0.010101 -4.595120
# 1   0.10   0.111111 -2.197225
# 2   0.20   0.250000 -1.386294
# 3   0.30   0.428571 -0.847298
# 4   0.40   0.666667 -0.405465
# 5   0.50   1.000000  0.000000
# 6   0.60   1.500000  0.405465
# 7   0.70   2.333333  0.847298
# 8   0.80   4.000000  1.386294
# 9   0.90   9.000000  2.197225
# 10  0.99  99.000000  4.595120

# In [2]: plot the three columns against the row index.
fig, ax = plt.subplots()
# Each curve carries its own label. The previous version passed a *set* of
# labels to legend(); a set has no defined order, so the labels could be
# attached to the wrong curves.
ax.plot(table['prob'], 'g', label='probability')
ax.plot(table['odds'], 'y', label='Odds')
ax.plot(table['log-odds'], 'm', label='log_odds')
ax.legend()
# Clip the y-axis so the odds value of 99 does not flatten the other curves.
ax.set_ylim([-6, 6])
# Out[2] (recorded): (-6, 6)
# In [6]: imports and data loading.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from scipy import stats
from sklearn.model_selection import train_test_split

# Raw string literal: backslashes are taken literally, so the Windows path
# needs no doubled "\\" before "bikeshare" (a plain "\b" would be a backspace)
# and no longer relies on invalid escapes such as "\P" or "\S".
BIKESHARE_CSV = r"C:\Python\Scripts\my_data\bikeshare.csv"
# Seed for the train/test split so the reported accuracy is reproducible.
SPLIT_SEED = 42

bikes = pd.read_csv(BIKESHARE_CSV)
print(bikes.shape)
# Recorded output: (10886, 13)

# In [30]: peek at the first rows.
bikes.head()
# Recorded columns: Unnamed: 0, datetime, season, holiday, workingday, weather,
# temp, atemp, humidity, windspeed, casual, registered, count

# In [18]: logistic regression predicting "above-average rental count"
# from temperature alone.
feature_cols = ['temp']
x = bikes[feature_cols]
# Binary target: True when the row's count is at or above the overall mean.
# It is stored on the frame because the season bar chart below groups on it.
bikes['above_average'] = bikes['count'] >= bikes['count'].mean()
y = bikes['above_average']

x_train, x_test, y_train, y_test = train_test_split(
    x, y, random_state=SPLIT_SEED
)
logreg = LogisticRegression()
logreg.fit(x_train, y_train)

# Build the comparison frame from a dict so each column is bound to its own
# data. The previous version passed the two labels as a *set*, which has no
# defined order (the headers could be swapped) and which newer pandas
# releases may reject outright.
comparison = pd.DataFrame(
    {'真实值': y_test.values, '预测值': logreg.predict(x_test)}
)
print(comparison)
print('\n')
print('分类准确率是:', logreg.score(x_test, y_test))  # mean accuracy on the test split
# The originally recorded run (unseeded split, 2722 test rows) printed an
# accuracy of 0.6649522409992652; a seeded split will give a different but
# repeatable figure.

# In [19]: share of above-average rows per season, then one-hot encode season.
bikes.groupby('season').above_average.mean().plot(kind='bar')
when_dummies = pd.get_dummies(bikes['season'], prefix='season_')
when_dummies.head()
# Recorded columns: season__1, season__2, season__3, season__4
# In [20]: logistic regression on temperature plus season dummies.

# Rebuild the dummies from `bikes` with the first level dropped, which avoids
# perfect collinearity among the season columns. The previous version
# re-sliced `when_dummies` in place (`when_dummies.iloc[:, 1:]`), so every
# re-run of the cell silently removed one more season column.
when_dummies = pd.get_dummies(bikes['season'], prefix='season_', drop_first=True)

new_bike = pd.concat([bikes[['temp']], when_dummies], axis=1)
x = new_bike

# `y` is the above-average target defined in the earlier single-feature cell.
# A fixed seed makes the reported accuracy repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)
logreg = LogisticRegression()
logreg.fit(x_train, y_train)
y_pred = logreg.predict(x_test)
print('用气温、季节同时作为预测自变量,预测的准确率为: ', logreg.score(x_test, y_test))
# The originally recorded run (unseeded split) printed 0.6958119030124909.