1.简答题
请打开:资料–课程所用数据–Incomregression.csv,利用该csv文件中的数据,选择一种Python编译器编写Python程序,完成以下内容:
1.读取数据,并选择变量中类型为"float64"的变量,对这些变量进行描述性分析(10分)
2.对上述类型为"float64"的变量计算两两相关系数,列出相关系数矩阵( 10分)
3.用绘图程序(可以用matplotlib或其他python第三方包)绘制MonthlyIncome、DebtRatio、RevolvingUtilizationOfUnsecuredLines三个变量的3d散点图(20分)
4.绘制MonthlyIncome与DebtRatio,MonthlyIncome与RevolvingUtilizationOfUnsecuredLines,
MonthlyIncome与age,三幅2d散点图(20分)
5.调用statsmodels模块,运用最小二乘法拟合线性回归模型,模型因变量为MonthlyIncome,自变量为age、
RevolvingUtilizationOfUnsecuredLines、DebtRatio,并提供所有拟合模型后的信息报告(20分)
6.调用scikit-learn模块,仍用回归分析方法拟合线性回归模型,模型因变量为MonthlyIncome,自变量为age、
RevolvingUtilizationOfUnsecuredLines、DebtRatio,并进行5折交叉验证(20分)
import pandas as pd
import numpy as np
# Load the data set. pandas is left to infer each column's dtype instead of
# coercing every column to float64, so that "the float64 variables" is an
# actual selection rather than trivially every column.
df = pd.read_csv('Incomregression.csv', engine='python')

# Keep only the columns that pandas parsed as float64.
float_df = df.select_dtypes(include=['float64'])

# print() is required: the value of a bare expression is discarded when this
# file is executed as a script.
print(float_df.describe())  # descriptive statistics per float64 column
print(float_df.corr())      # pairwise correlation matrix of the float64 columns
import matplotlib.pyplot as plt
# Not referenced by name below; kept because older matplotlib releases need
# this import to make projection='3d' available.
from mpl_toolkits.mplot3d import Axes3D

# 3-D scatter plot of MonthlyIncome, DebtRatio and
# RevolvingUtilizationOfUnsecuredLines.
fig = plt.figure()
ax3d = fig.add_subplot(111, projection='3d')
ax3d.scatter(
    df['MonthlyIncome'],
    df['DebtRatio'],
    df['RevolvingUtilizationOfUnsecuredLines'],
    c='r',
    marker='^',
)
# Label each axis with the variable it shows so the figure is self-explanatory.
ax3d.set_xlabel('MonthlyIncome')
ax3d.set_ylabel('DebtRatio')
ax3d.set_zlabel('RevolvingUtilizationOfUnsecuredLines')
plt.show()
import matplotlib.pyplot as plt

# Three stacked 2-D scatter plots, each with MonthlyIncome on the x-axis.
# scatter() is used instead of plot(): plot() joins consecutive rows with line
# segments, which does not produce a scatter plot.
fig = plt.figure()
for position, y_name in enumerate(
        ['DebtRatio', 'RevolvingUtilizationOfUnsecuredLines', 'age'], start=1):
    ax1 = plt.subplot(3, 1, position)
    ax1.scatter(df['MonthlyIncome'], df[y_name], s=5)  # small markers to limit overplotting
    ax1.set_xlabel('MonthlyIncome')
    ax1.set_ylabel(y_name)
fig.tight_layout()  # stop the stacked axis labels from overlapping
plt.show()          # without this the figure is never displayed in a script
import statsmodels.formula.api as smf

# Ordinary least squares via the statsmodels formula interface:
# MonthlyIncome is the response; age, RevolvingUtilizationOfUnsecuredLines
# and DebtRatio are the regressors.
formula = "MonthlyIncome ~ age + RevolvingUtilizationOfUnsecuredLines + DebtRatio"
model = smf.ols(formula=formula, data=df)
results = model.fit()

# Print the full fit report produced by statsmodels.
report = results.summary()
print(report)
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn import metrics
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import train_test_split

# One multiple linear regression: MonthlyIncome explained jointly by the three
# regressors, instead of three unrelated single-feature fits.
feature_cols = ['age', 'RevolvingUtilizationOfUnsecuredLines', 'DebtRatio']
X = df[feature_cols]
y = df['MonthlyIncome']
# NOTE(review): LinearRegression rejects NaN inputs; this assumes the three
# feature columns and MonthlyIncome contain no missing values -- confirm.

# Hold-out fit: 80% of the rows for training, 20% for testing, fixed seed so
# the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)
print(dict(zip(feature_cols, lin_reg.coef_)))  # one coefficient per regressor
print(lin_reg.intercept_)

# Evaluate the hold-out predictions rather than computing and discarding them.
y_pred = lin_reg.predict(X_test)
print('TEST_MSE', metrics.mean_squared_error(y_test, y_pred))

# 5-fold cross-validation on the full data set: every row is predicted by a
# model trained on the other four folds.
predicted = cross_val_predict(LinearRegression(), X, y, cv=5)
cross_mse = metrics.mean_squared_error(y, predicted)
cross_rmse = np.sqrt(cross_mse)  # reuse the MSE instead of recomputing it
print('CROSS_MSE', cross_mse)
print('CROSS_RMSE', cross_rmse)