机器学习
使用机器学习方法,首先需要特征数据以及指标数据。
在本文中,基于时间序列数据构造特征数据如下:
- 特征数据1:滞后数据。选择 7 天前的 `demand` 数据作为特征数据。
- 特征数据2:移动平均数据。选择 7 天前至 13 天前(共 7 天)的 `demand` 移动平均值数据作为特征数据。
- 特征数据3:月销售均值
- 特征数据4:每月销售最大值
- 特征数据5:每月销售最小值
- 特征数据6:每月销售最大值与最小值的差值
- 特征数据7:按星期几(dayofweek)分组的销售均值
- 特征数据8:按星期几(dayofweek)分组的销售最大值
- 特征数据9:按星期几(dayofweek)分组的销售中值
具体代码如下:
def lags_windows(df, lags=(7,), wins=(7,)):
    """Add per-``id`` lag and rolling-mean features of ``demand`` to ``df``.

    For every ``lag`` a column ``lag_<lag>`` holds ``demand`` shifted by
    ``lag`` rows within each ``id``.  For every ``win`` and ``lag`` a column
    ``rmean_<lag>_<win>`` holds the rolling mean of ``lag_<lag>`` over the
    last ``win`` rows of the same ``id``.  With the defaults this yields
    ``lag_7`` and ``rmean_7_7``.

    Shifting is positional, so rows are assumed to be in chronological
    order with one row per day inside each ``id`` -- this is not checked.

    Args:
        df: frame with at least the columns ``id`` and ``demand``.
        lags: row offsets to shift ``demand`` by.
        wins: rolling-window lengths applied to each lag column.

    Returns:
        The same ``df`` object, modified in place with the new columns.
    """
    lag_cols = ["lag_{}".format(lag) for lag in lags]
    for lag, lag_col in zip(lags, lag_cols):
        df[lag_col] = df.groupby("id")["demand"].shift(lag)

    for win in wins:
        for lag, lag_col in zip(lags, lag_cols):
            df["rmean_{}_{}".format(lag, win)] = (
                df.groupby("id")[lag_col]
                .transform(lambda x: x.rolling(win).mean())
            )
    return df


def _group_stat(df, keys, col, how):
    """Broadcast the ``how`` aggregate of ``col`` per ``keys`` group, as float32."""
    return df.groupby(keys)[col].transform(how).astype("float32")


def per_timeframe_stats(df, col):
    """Add per-``id`` descriptive statistics of ``col`` by month and weekday.

    Every row receives the statistic of its own (``id``, ``month``) or
    (``id``, ``dayofweek``) group, computed over the rows present in ``df``
    (the row itself included).

    Columns added, in this order:
        ``<col>_month_mean``, ``<col>_month_max``, ``<col>_month_min``,
        ``<col>month_max_to_min_diff``, ``<col>_dayofweek_mean``,
        ``<col>_dayofweek_median``, ``<col>_dayofweek_max``.

    Args:
        df: frame with the columns ``id``, ``month``, ``dayofweek`` and ``col``.
        col: name of the numeric column to summarise.

    Returns:
        The same ``df`` object, modified in place with the new columns.
    """
    # One grouped transform per statistic replaces a Python loop over every
    # distinct month / weekday value; it also creates the columns when ``df``
    # is empty, so the difference below can no longer hit a missing column.
    month_keys = ["id", "month"]
    df[col + "_month_mean"] = _group_stat(df, month_keys, col, "mean")
    df[col + "_month_max"] = _group_stat(df, month_keys, col, "max")
    df[col + "_month_min"] = _group_stat(df, month_keys, col, "min")
    # The missing underscore in this column name is kept on purpose: the name
    # is part of the output schema that downstream column lists are built from.
    df[col + "month_max_to_min_diff"] = (
        df[col + "_month_max"] - df[col + "_month_min"]
    ).astype("float32")

    dow_keys = ["id", "dayofweek"]
    df[col + "_dayofweek_mean"] = _group_stat(df, dow_keys, col, "mean")
    df[col + "_dayofweek_median"] = _group_stat(df, dow_keys, col, "median")
    df[col + "_dayofweek_max"] = _group_stat(df, dow_keys, col, "max")
    return df


def feat_eng(df):
    """Run the full feature pipeline on ``df`` and return it.

    Applies :func:`lags_windows` (``lag_7``, ``rmean_7_7``) followed by
    :func:`per_timeframe_stats` on the ``demand`` column.  ``df`` is modified
    in place; pass a copy if the original frame must stay untouched.
    """
    df = lags_windows(df)
    df = per_timeframe_stats(df, "demand")
    return df
准备数据:
# Load the daily series and parse the dates so they can be compared
# against the date strings below.
data = pd.read_csv('data_for_tsa.csv')
data['date'] = pd.to_datetime(data['date'])

# Both splits are mutated later (feat_eng adds feature columns to `train`,
# the prediction loop writes `preds_*` columns into `test`), so take explicit
# copies instead of writing into boolean-mask slices of `data`.
train = data[data['date'] <= '2016-03-27'].copy()
# `test` deliberately starts before the end of `train`: the overlapping days
# supply the history needed for the lag / rolling features of the first
# forecast days.
test = data[(data['date'] > '2016-03-11') & (data['date'] <= '2016-04-24')].copy()

# Build the features and drop every row that has a NaN in any column
# (e.g. the first rows of each id, where the lag features are undefined).
data_ml = feat_eng(train)
data_ml = data_ml.dropna()

# Columns that are never fed to the tree models (identifiers, target, date, ...).
useless_cols = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id',
                'demand', 'date', 'demand_month_min']
# use different columns for linear regression
linreg_train_cols = ['sell_price', 'year', 'month', 'dayofweek', 'lag_7', 'rmean_7_7']
lgb_train_cols = data_ml.columns[~data_ml.columns.isin(useless_cols)]

X_train = data_ml[lgb_train_cols].copy()
y_train = data_ml["demand"]
模型拟合
通过 light gradient boosting、linear regression、random forest 三种方法对数据进行拟合:
# Fit Light Gradient Boosting
# The timer covers parameter setup, the train/validation split and training.
t0 = time.time()
lgb_params = dict(
    objective="poisson",
    metric="rmse",
    force_row_wise=True,
    learning_rate=0.075,
    sub_row=0.75,
    bagging_freq=1,
    lambda_l2=0.1,
    verbosity=1,
    num_iterations=2000,
    num_leaves=128,
    min_data_in_leaf=50,
)

# Hold out 365 randomly chosen rows as a "fake" validation set; the fixed
# seed keeps the split reproducible.
np.random.seed(777)
fake_valid_inds = np.random.choice(X_train.index.values, 365, replace=False)
train_inds = np.setdiff1d(X_train.index.values, fake_valid_inds)

train_data = lgb.Dataset(
    X_train.loc[train_inds],
    label=y_train.loc[train_inds],
    free_raw_data=False,
)
fake_valid_data = lgb.Dataset(
    X_train.loc[fake_valid_inds],
    label=y_train.loc[fake_valid_inds],
    free_raw_data=False,
)
# NOTE(review): `verbose_eval` is only accepted by older LightGBM releases --
# confirm against the installed version before upgrading.
m_lgb = lgb.train(
    lgb_params,
    train_data,
    valid_sets=[fake_valid_data],
    verbose_eval=0,
)
t_lgb = time.time() - t0

# Fit Linear Regression (on its own, smaller set of columns)
t0 = time.time()
m_linreg = LinearRegression()
m_linreg.fit(X_train[linreg_train_cols].loc[train_inds], y_train.loc[train_inds])
t_linreg = time.time() - t0

# Fit Random Forest (same rows and columns as LightGBM)
t0 = time.time()
m_rf = RandomForestRegressor(
    n_estimators=100,
    max_depth=5,
    random_state=26,
    n_jobs=-1,
)
m_rf.fit(X_train.loc[train_inds], y_train.loc[train_inds])
t_rf = time.time() - t0
模型预测
值得注意的是,在训练阶段,我们使用了 7 天前的 demand 数据以及 7 天前至 13 天前(共 7 天)的 demand 移动平均值数据作为特征数据。但是在预测阶段,待预测日期的 demand 数据是未知的。因此这里需要借助滑动窗口(sliding window)的概念,也就是每次只计算一天的预测数据。为了计算移动平均值数据,设置滑动窗口长度为 15。
通过滑动窗口方法预测未知数据的具体代码如下:
# Forecast 28 consecutive days starting at `fday`, one day per iteration.
fday = datetime(2016, 3, 28)
max_lags = 15  # days of history kept in front of the day being predicted

for tdelta in range(28):
    day = fday + timedelta(days=tdelta)

    # Re-build the features on the window [day - max_lags, day] only.
    # NOTE(review): the window includes the rows of `day` itself, so the
    # per-month / per-weekday statistics produced by feat_eng are computed
    # from that day's own `demand` -- confirm this is intended.
    window_start = day - timedelta(days=max_lags)
    in_window = (test.date >= window_start) & (test.date <= day)
    tst = feat_eng(test[in_window].copy())

    # Rows of `test` that receive this day's predictions / feature rows of that day.
    is_day = test.date == day
    day_rows = tst.date == day

    # LightGBM is given the features as they are (missing values included).
    tst_lgb = tst.loc[day_rows, lgb_train_cols].copy()
    test.loc[is_day, "preds_LightGB"] = m_lgb.predict(tst_lgb)

    # Random forest: same columns, missing values replaced by 0.
    tst_rf = tst.loc[day_rows, lgb_train_cols].copy().fillna(0)
    test.loc[is_day, "preds_RandomForest"] = m_rf.predict(tst_rf)

    # Linear regression: its own column subset, missing values replaced by 0.
    tst_linreg = tst.loc[day_rows, linreg_train_cols].copy().fillna(0)
    test.loc[is_day, "preds_LinearReg"] = m_linreg.predict(tst_linreg)

# Keep only the forecast horizon (drops the history-only rows before `fday`).
test_final = test.loc[test.date >= fday]
Light Gradient Boosting
# Collect, plot and score the LightGBM forecast.
model_name = 'LightGB'
predictions[model_name] = test_final["preds_" + model_name]

# visualize: last 28 training days, actual test demand and the forecast
fig, ax = plt.subplots(figsize=(25, 4))
train[-28:].plot(x='date', y='demand', label='Train', ax=ax)
test_final.plot(x='date', y='demand', label='Test', ax=ax)
predictions.plot(x='date', y=model_name, label=model_name, ax=ax)

# evaluate (arguments in the documented y_true, y_pred order; MSE is symmetric)
score = np.sqrt(mean_squared_error(test_final['demand'], predictions[model_name].values))
print('RMSE for {}: {:.4f}'.format(model_name, score))

# DataFrame.append was removed in pandas 2.0; pd.concat works on every version.
stats = pd.concat(
    [stats, pd.DataFrame([{'Model Name': model_name, 'Execution Time': t_lgb, 'RMSE': score}])],
    ignore_index=True,
)
Light Gradient Boosting 预测结果
Random Forest
# Collect, plot and score the random-forest forecast.
model_name = 'RandomForest'
predictions[model_name] = test_final["preds_" + model_name]

# visualize: last 28 training days, actual test demand and the forecast
fig, ax = plt.subplots(figsize=(25, 4))
train[-28:].plot(x='date', y='demand', label='Train', ax=ax)
test_final.plot(x='date', y='demand', label='Test', ax=ax)
predictions.plot(x='date', y=model_name, label=model_name, ax=ax)

# evaluate (arguments in the documented y_true, y_pred order; MSE is symmetric)
score = np.sqrt(mean_squared_error(test_final['demand'], predictions[model_name].values))
print('RMSE for {}: {:.4f}'.format(model_name, score))

# Record the random forest's own fit time (t_rf); the LightGBM timer t_lgb
# was being logged here by mistake.
# DataFrame.append was removed in pandas 2.0; pd.concat works on every version.
stats = pd.concat(
    [stats, pd.DataFrame([{'Model Name': model_name, 'Execution Time': t_rf, 'RMSE': score}])],
    ignore_index=True,
)
Random Forest 预测结果
Linear Regression
# Collect, plot and score the linear-regression forecast.
model_name = 'LinearReg'
predictions[model_name] = test_final["preds_" + model_name]

# visualize: last 28 training days, actual test demand and the forecast
fig, ax = plt.subplots(figsize=(25, 4))
train[-28:].plot(x='date', y='demand', label='Train', ax=ax)
test_final.plot(x='date', y='demand', label='Test', ax=ax)
predictions.plot(x='date', y=model_name, label=model_name, ax=ax)

# evaluate (arguments in the documented y_true, y_pred order; MSE is symmetric)
score = np.sqrt(mean_squared_error(test_final['demand'], predictions[model_name].values))
print('RMSE for {}: {:.4f}'.format(model_name, score))

# DataFrame.append was removed in pandas 2.0; pd.concat works on every version.
stats = pd.concat(
    [stats, pd.DataFrame([{'Model Name': model_name, 'Execution Time': t_linreg, 'RMSE': score}])],
    ignore_index=True,
)
Linear Regression 预测结果
以上就是所有的预测方法及过程。各个方法的运算时间及结果误差如下:
# sort_values returns a new frame; outside a notebook cell the result was
# silently discarded, so print the ranked table explicitly.
print(stats.sort_values(by='RMSE'))

# Bar chart of the RMSE per model (rows in the order they were recorded).
stats.plot(
    kind='bar',
    x='Model Name',
    y='RMSE',
    figsize=(12, 6),
    title="Model RMSE Comparison - Lower is better",
)
各个方法的运算时间及结果误差对比
各个方法的结果误差对比
可以看出,传统预测方法的性能相较于机器学习预测方法较差。
但是这个结论并不是绝对的。方法的准确度取决于不同的问题背景。机器学习方法依赖于特征数据。如果我们只有时间序列数据,那么特征数据较为缺乏,此时可以基于原始数据创建特征数据,如滞后数据、移动平均数据等。因此,机器学习方法要呈现更好的预测结果,特征工程至关重要。在机器学习领域,某种程度上,起决定作用的是数据,而不是模型或者算法。