1. 加载数据
import pandas as pd import matplotlib import matplotlib.pyplot as plt import numpy as np import seaborn as sns
关注公众号:阿旭算法与机器学习,回复:“ML30”即可获取本文数据集、源码与项目文档,欢迎共同学习交流
# Load the raw car dataset into a DataFrame and display it.
df = pd.read_csv('data.csv')
df  # show the data frame
2.特征工程
2.1 数据编码
# One-hot encode the two categorical columns, append the indicator
# columns to the frame, then drop the columns they replace.
df_colors = df['Color'].str.get_dummies().add_prefix('Color: ')
df_type = df['Type'].apply(str).str.get_dummies().add_prefix('Type: ')

df = pd.concat([df, df_colors, df_type], axis=1)
df = df.drop(['Brand', 'Type', 'Color'], axis=1)
df
# Feature correlation heatmap (dark cells = negative correlation,
# light cells = positive correlation).
matrix = df.corr()
f, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(matrix, square=True)
plt.title('Car Price Variables')

# Pairwise scatter plots of the candidate features to eyeball their
# relationship with the target.
# FIX: seaborn renamed pairplot's `size` keyword to `height` in 0.9 and
# removed `size` in later releases; `size=2` raises a TypeError on
# current versions.
sns.pairplot(
    df[['Construction Year', 'Days Until MOT', 'Odometer', 'Ask Price']],
    height=2)
plt.show()
热图中,黑色表示负相关,白色表示正相关。通过解读以上两张图,选择相关性较大的特征,对模型进行训练。
3.构建模型进行训练与评估
3.1 构建模型并训练
# -*- coding: utf-8 -*-
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Load the dataset and one-hot encode the categorical columns,
# dropping the columns the dummies replace.
df = pd.read_csv('D:/data.csv')
df_colors = pd.get_dummies(df['Color'], prefix='Color:')
df_type = pd.get_dummies(df['Type'].apply(str), prefix='Type:')
df = pd.concat([df, df_colors, df_type], axis=1)
df.drop(['Brand', 'Type', 'Color'], axis=1, inplace=True)

# Train on the three features most correlated with price
# (chosen from the heatmap / pairplot above).
X = df[['Construction Year', 'Days Until MOT', 'Odometer']]
y = df['Ask Price'].values.reshape(-1, 1)

# Hold out 30% of the samples for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=41)

# Standardize features to N(0, 1): fit on the training split only,
# then apply the same transform to the test split (avoids leakage).
X_normalizer = StandardScaler()
X_train = X_normalizer.fit_transform(X_train)
X_test = X_normalizer.transform(X_test)

# Standardize the target the same way so distances aren't dominated
# by the price scale.
y_normalizer = StandardScaler()
y_train = y_normalizer.fit_transform(y_train)
y_test = y_normalizer.transform(y_test)

# Regression averages the neighbours' values, so an even n_neighbors
# is fine here (no majority vote that could tie).
knn = KNeighborsRegressor(n_neighbors=2)
knn.fit(X_train, y_train.ravel())  # ravel() flattens y to the 1-D shape fit expects

y_pred = knn.predict(X_test)

# Undo the target scaling to get prices back in the original units.
# BUG FIX: predict() returns a 1-D array (y was raveled for fit), but
# StandardScaler.inverse_transform requires a 2-D (n_samples, 1) input
# on current scikit-learn — reshape first, then ravel back to keep the
# original 1-D result shape.
y_pred_inv = y_normalizer.inverse_transform(y_pred.reshape(-1, 1)).ravel()
y_test_inv = y_normalizer.inverse_transform(y_test)

# Scatter predictions against ground truth.
plt.scatter(y_pred_inv, y_test_inv)
plt.xlabel('Prediction')
plt.ylabel('Real value')

# Perfect-prediction reference line.
diagonal = np.linspace(500, 1500, 100)  # evenly spaced points over the interval
plt.plot(diagonal, diagonal, '-r')  # '-' solid line, 'r' red
plt.xlabel('Predicted ask price')
plt.ylabel('Ask price')
plt.show()
3.2 模型评估
# Display the fitted estimator and its hyper-parameters.
knn
KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski', metric_params=None, n_jobs=None, n_neighbors=2, p=2, weights='uniform')
# Predict on the test split; values are still in standardized units
# (z-scores), not prices — compare with y_pred_inv below.
pred = knn.predict(X_test)
pred
array([ 1.36676513, 1.36676513, -0.68269804, 0.13462294])
from sklearn.metrics import mean_absolute_error
# Mean absolute error in price units.
# FIX: sklearn metrics take (y_true, y_pred) in that order. MAE is
# symmetric so the value is unchanged, but the conventional order
# avoids silent mistakes with asymmetric metrics.
mean_absolute_error(y_test_inv, y_pred_inv)
175.5
from sklearn.metrics import mean_squared_error
# Mean squared error in squared price units.
# FIX: conventional (y_true, y_pred) argument order; MSE is symmetric,
# so the computed value is identical.
mean_squared_error(y_test_inv, y_pred_inv)
56525.5
# Recovered predictions in the original price units (1-D array).
y_pred_inv
array([1199., 1199., 700., 899.])
# Ground-truth prices; still shaped (n_samples, 1) from the scaler.
y_test_inv
array([[1300.], [1650.], [ 650.], [ 799.]])