您可以找到我用来从Jupyter Notebook中加载此项目的数据的代码。
-- Step 1: expose only the rows that have a title, a review count and both
-- prices, and whose original price contains a '.' and does not contain 'ree'
-- (presumably filters out "Free"-style price labels -- confirm against the data).
create view cleaned_raw as
select *
from steam_3t_strat sts
where title is not null
  and reviewcount is not null
  and originalprice is not null
  and afterdiscount is not null
  and originalprice not like '%ree%'
  and originalprice like '%.%';

-- Step 2: remove rows without a release date.
-- NOTE(review): this DELETE targets the view; on PostgreSQL a simple
-- single-table view is presumably auto-updatable, so the rows would be removed
-- from steam_3t_strat itself -- confirm this is intended.
delete from cleaned_raw
where releasedate is null;

-- Step 3: baseline view with the release date cast to a date and the original
-- price cast to a floating-point number.
create view baseline as
select platform,
       reviewcount,
       positivepercent,
       releasedate::date,
       cast(originalprice as double precision),
       discountpercentage,
       alltags
from cleaned_raw;
- 将列“platform” 二值化为“multi platform” (也即这一款游戏是否在多平台上发售);
- 在原始的“discount percentage”列基础上创建一个模型需要预测的目标列,其含义为0代表没有打折,1代表打折。
- 最后,从上一个项目中,我们知道“days_since_release”(游戏发售多久了)是最重要的特性之一,因此我们将从“release date”列进行特征工程,但这次是用SQL完成的。
现在,有了新视图“basic_fe”,我们就可以在Jupyter Notebook中进行特征工程与模型构建。
- ROC-AUC评分(模型有效性)
- F1分数(查准率和查全率之间的调和平均值)
# Pull the whole feature-engineered view into a dataframe.
basic_fe = pd.read_sql_query('''SELECT * FROM basic_fe''', cnx)

# Exploratory working frame: built from basic_fe without modifying it, with the
# unneeded columns removed and every NaN replaced by 0.
df = (
    basic_fe
    .drop(columns=['platform', 'releasedate', 'alltags', 'discountpercentage'])
    .fillna(0)
)

# Separate the features from the target column.
y = df['onsale']
X = df.drop(columns='onsale')

# Run the baseline models (the train-test split happens inside classification).
# NOTE(review): the dict is presumably a per-class weight mapping -- confirm
# against the definition of classification.
baseline_model = classification(X, y, {0: 1, 1: 15})
- 最近邻分类器(n_neighbors=5)
- 逻辑回归(C=0.95)
- 高斯型朴素贝叶斯分类器
- 支持向量机(gamma='auto',probability=True)
- 决策树(random_state=5)*
- 随机森林(random_state=5)
- 梯度提升机(n_estimators = 90, max_depth = 100)
1. 特征变换
def PolynomialFeatures_labeled(input_df, power):
    '''
    Expand a labeled dataframe into polynomial features and keep readable labels.

    This is a cover for the sklearn PolynomialFeatures preprocessing step. Given
    a labeled dataframe, that step returns an unlabeled array with potentially
    many columns; this wrapper rebuilds a label for each of them.

    Inputs:
    input_df = Your labeled pandas dataframe (list of x's not raised to any power)
    power = what order polynomial you want variables up to (the same value you
            would pass to PolynomialFeatures(power) directly)

    Output:
    A labeled pandas dataframe with the same index as input_df. The first column
    is "Constant Term"; every other column is named from the powers_ matrix of
    the fitted transformer, e.g. "a^2" or "a^1 x b^1".
    '''
    poly = PolynomialFeatures(power)
    output_nparray = poly.fit_transform(input_df)
    input_feature_names = list(input_df.columns)

    target_feature_names = ["Constant Term"]
    # The first powers_ row belongs to the constant column labeled above, so
    # labeling starts from the second row.
    for exponents in poly.powers_[1:]:
        # Only variables with a non-zero exponent take part in this term.
        terms = [
            "%s^%d" % (variable, exponent)
            for variable, exponent in zip(input_feature_names, exponents)
            if exponent != 0
        ]
        target_feature_names.append(" x ".join(terms))

    # Reuse the input index so the result stays row-aligned with a target
    # series that shares that index.
    return pd.DataFrame(
        output_nparray,
        columns=target_feature_names,
        index=input_df.index,
    )


# Build the models and report the classification metrics:
# poly transform the data
explore_X_poly = PolynomialFeatures_labeled(X, 2)
# run modeling to see metrics
classification(explore_X_poly, y, {0: 1, 1: 15})
2. 特征结合
def explore_fe(df, target):
    '''
    A function to do exploratory feature engineering. It's flexible in its
    purpose, and is currently configured for this project only.

    For every original column it adds the square and the square root, and for
    every pair of adjacent original columns it adds their product and ratio.
    It then fits a RandomForestClassifier on a 70/30 split and prints accuracy,
    precision, recall, f1 and the feature importances sorted from high to low.

    Inputs:
    df (like X) = Your dataset without the target (y)
    target (like y) = Your target, whatever you are trying to predict --- Binary.
                      Must have one entry per row of df.

    Output:
    Returns engineered X (dataframe without target) based on the engineering
    logic. The input dataframe is not modified. The returned frame keeps every
    row, including rows where an engineered value is NaN or infinite; only the
    model fit inside this function skips those rows.
    '''
    df = df.astype(float)
    # Values 0 and 1 are shifted to 1 and 2 before engineering (presumably to
    # keep the ratios away from division by zero -- confirm).
    df = df.replace({0: 1, 1: 2})

    # Snapshot the original columns: df grows inside the loop, so indexing
    # df.columns there would pair the last original column with an engineered one.
    base_columns = list(df.columns)
    for position, column in enumerate(base_columns):
        df[f'{column}^2'] = np.square(df[column])
        df[f'{column}^1/2'] = np.sqrt(df[column])
        if position + 1 < len(base_columns):
            neighbour = base_columns[position + 1]
            df[f'{column} * {neighbour}'] = df[column] * df[neighbour]
            df[f'{column} / {neighbour}'] = df[column] / df[neighbour]

    # Fit only on rows where every value is finite; the target is filtered with
    # the same positional mask so X and y stay aligned.
    finite_rows = np.isfinite(df).all(axis=1)
    X, y = df[finite_rows], np.asarray(target)[finite_rows.to_numpy()]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=4444)
    ran = RandomForestClassifier(random_state=5)
    ran.fit(X_train, y_train)

    # Predict once and reuse the result for every metric.
    predictions = ran.predict(X_test)
    print ('Accuracy: ', accuracy_score(y_test, predictions))
    print("Precision: {:6.4f}, Recall: {:6.4f}, f1: {:6.4f}".format(precision_score(y_test, predictions),
                                                                    recall_score(y_test, predictions),
                                                                    f1_score(y_test, predictions)), '\n')

    # Feature importances, most important first.
    k = list(X.columns)
    pp = pprint.PrettyPrinter(indent=4)
    pp.pprint(sorted(zip(k, ran.feature_importances_), key=lambda x: x[1], reverse=True))

    return df
3. 特征选择
def feature_selection(X, y, score_to_keep = 5, num_feats = None):
    '''
    A function to select features by votes of 6 models who can calculate
    feature importances. Also prints out how many original features there are,
    how many selected, and a list of selected features.

    Original idea from https://www.kaggle.com/mlwhiz/feature-selection-using-football-data

    Inputs:
    X = Your dataset without the target (y)
    y = Your target, whatever you are trying to predict --- Binary.
    score_to_keep = Pick features that have a 'score_to_keep' amount of votes
                    --- max is 6 votes, default is 5.
    num_feats = How many features each selector may keep. Default (None) means
                all features; in that case the Pearson, Chi-2 and RFE selectors
                keep every feature, so only the three model-based selectors
                (logistic regression, random forest, LightGBM) discriminate.

    Output:
    Returns selected_X as a dataframe without target(y), with the columns in
    their original order.

    Raises:
    ValueError if num_feats is given but is not between 1 and the number of
    columns in X.
    '''
    feature_name = list(X.columns)
    total_feats = len(feature_name)
    if num_feats is None:
        num_feats = total_feats
    elif not 1 <= num_feats <= total_feats:
        raise ValueError(
            "num_feats must be between 1 and %d, got %r" % (total_feats, num_feats)
        )

    def cor_selector(X, y, num_feats):
        # Pearson correlation of every feature with y; NaN counts as 0.
        cor_list = [np.corrcoef(X[column], y)[0, 1] for column in X.columns]
        cor_list = [0 if np.isnan(value) else value for value in cor_list]
        # Names of the num_feats features with the largest absolute correlation.
        cor_feature = X.iloc[:, np.argsort(np.abs(cor_list))[-num_feats:]].columns.tolist()
        # One boolean vote per feature: True = selected.
        cor_support = [column in cor_feature for column in X.columns]
        return cor_support, cor_feature

    cor_support, _ = cor_selector(X, y, num_feats)

    # Every sklearn selector below is fitted on the min-max scaled features.
    X_norm = MinMaxScaler().fit_transform(X)

    chi_selector = SelectKBest(chi2, k=num_feats)
    chi_selector.fit(X_norm, y)

    rfe_selector = RFE(estimator=LogisticRegression(), n_features_to_select=num_feats, step=10, verbose=5)
    rfe_selector.fit(X_norm, y)

    embeded_lr_selector = SelectFromModel(LogisticRegression(penalty="l2"), max_features=num_feats)
    embeded_lr_selector.fit(X_norm, y)

    embeded_rf_selector = SelectFromModel(RandomForestClassifier(n_estimators=100), max_features=num_feats)
    embeded_rf_selector.fit(X_norm, y)

    lgbc = LGBMClassifier(n_estimators=500, learning_rate=0.05, num_leaves=32, colsample_bytree=0.2,
                          reg_alpha=3, reg_lambda=1, min_split_gain=0.01, min_child_weight=40)
    embeded_lgb_selector = SelectFromModel(lgbc, max_features=num_feats)
    embeded_lgb_selector.fit(X_norm, y)

    # One boolean column per selector, one row per feature.
    votes = pd.DataFrame({'Pearson': cor_support,
                          'Chi-2': chi_selector.get_support(),
                          'RFE': rfe_selector.get_support(),
                          'Logistics': embeded_lr_selector.get_support(),
                          'Random Forest': embeded_rf_selector.get_support(),
                          'LightGBM': embeded_lgb_selector.get_support()},
                         index=feature_name)
    # Sum only the boolean vote columns; the feature names live in the index so
    # no string column takes part in the sum.
    total_votes = votes.sum(axis=1)

    # Drop every feature that collected fewer than score_to_keep votes.
    to_drop = total_votes.index[total_votes < score_to_keep].tolist()
    selected_X = X.drop(columns=to_drop)

    print ("Number of orginal features: ", total_feats)
    print ("Number of selected features: ", len(selected_X.columns), '\n')
    pp = pprint.PrettyPrinter(indent=4)
    print("Selected Features:")
    pp.pprint(list(selected_X.columns))

    return selected_X
# Re-split the data so the split matches the dataset that was fed into the
# best model.
X, y = exp_Xpoly_sel_5, y
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)

# The held-out split is explicitly called "validation" here because it is used
# to select the decision threshold.
X_val, y_val = X_test, y_test

# Candidate thresholds, and the positive-class probability of every validation row.
thresh_ps = np.linspace(.10,.50,1000)
model_val_probs = best_model.predict_proba(X_val)[:,1]

# One score list per metric, filled in lockstep with thresh_ps.
f1_scores, prec_scores, rec_scores, acc_scores = [], [], [], []
metric_sinks = (
    (f1_scores, f1_score),
    (prec_scores, precision_score),
    (rec_scores, recall_score),
    (acc_scores, accuracy_score),
)
for p in thresh_ps:
    # A row is labeled positive when its probability reaches the threshold.
    model_val_labels = model_val_probs >= p
    for sink, scorer in metric_sinks:
        sink.append(scorer(y_val, model_val_labels))

# Plot every metric against the threshold, in the same order as the legend.
for sink, _ in metric_sinks:
    plt.plot(thresh_ps, sink)
plt.title('Metric Scores vs. Positive Class Decision Probability Threshold')
plt.legend(['F1','Precision','Recall','Accuracy'])
plt.xlabel('P threshold')
plt.ylabel('Metric score')

# Report the threshold that maximizes F1 on the validation split.
best_f1_score = np.max(f1_scores)
best_thresh_p = thresh_ps[np.argmax(f1_scores)]
print('best_model best F1 score %.3f at prob decision threshold >= %.3f'
      % (best_f1_score, best_thresh_p))