Finally, note that the parents are selected entirely at random, rather than based on the "fitness" measure commonly used in genetic algorithms.
def oversample_crossover(X, y, rows_1, mode="single", knn=False, random_state=1):
    '''Oversamples positively labeled data using a crossover operation.

    Args:
        X: Array of explanatory variables to oversample data from
        y: Array of labels associated with X
        rows_1: Number of positively labeled rows required (original + oversampled)
        mode: Choice between single-point ("single"), two-point ("two"), and
            uniform ("uniform") crossover operations
        knn: If set to True, drops oversampled data whose nearest neighbor is not
            positively labeled.
        random_state: Random state to pass to ensure reproducibility of results

    Returns:
        X_crossover: Array of explanatory variables associated with the new
            oversampled data (includes original + new data)
        y_crossover: Labels associated with X_crossover
    '''
    np.random.seed(random_state)

    # "Potential" because if the knn parameter is set to True, these samples
    # still need to be checked for whether their nearest neighbor has a label of 1
    potential_samples = []
    X_positive = X[y == 1]
    no_rows = X_positive.shape[0]
    no_cols = X_positive.shape[1]

    # Assume the share of 1s is at least 3%; this is relevant if knn=True
    for i in range(int(rows_1 / 0.03)):
        parent_1 = np.random.randint(0, no_rows)
        parent_2 = np.random.randint(0, no_rows)

        if mode == "single":
            cross_point = np.random.randint(1, no_cols)
            mask = np.array([1 if col_no < cross_point else 0
                             for col_no in range(no_cols)])
        elif mode == "two":
            cross_point_1 = np.random.randint(1, no_cols - 1)
            cross_point_2 = np.random.randint(cross_point_1, no_cols - 1)
            mask = np.array([
                1 if col_no < cross_point_1 or col_no > cross_point_2 else 0
                for col_no in range(no_cols)])
        elif mode == "uniform":
            mask = np.random.randint(0, 2, no_cols)
        else:
            raise ValueError("Acceptable options for mode: single, two, uniform")

        potential_samples.append(
            (X_positive[parent_1] * mask) + (X_positive[parent_2] * (1 - mask))
        )

    if knn == False:
        X_crossover = potential_samples
    else:
        scaler = MinMaxScaler().fit(X)
        X_scaled = scaler.transform(X)
        potential_samples_scaled = scaler.transform(potential_samples)

        model = KNeighborsClassifier(n_neighbors=1)
        model.fit(X_scaled, y)
        knn_filter = (model.predict_proba(potential_samples_scaled)[:, 1] > 0)
        X_crossover = np.array(potential_samples)[knn_filter]

    required_rows = rows_1 - (y == 1).sum()
    X_crossover = np.vstack([X, X_crossover[:required_rows]])
    y_crossover = np.hstack([y, np.ones(required_rows)])

    return X_crossover, y_crossover
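To show how the function is meant to be called, here is a minimal usage sketch on a synthetic imbalanced dataset. The dataset, the roughly 5% positive share, and the target of 300 positive rows are illustrative assumptions, not part of the original experiment; the snippet also pulls in the numpy and scikit-learn imports the function relies on.

import numpy as np
from sklearn.datasets import make_classification
from sklearn.preprocessing import MinMaxScaler          # used inside oversample_crossover
from sklearn.neighbors import KNeighborsClassifier      # used inside oversample_crossover

# Hypothetical imbalanced dataset: roughly 5% positive labels
X_demo, y_demo = make_classification(
    n_samples=2000, n_features=10, weights=[0.95, 0.05], random_state=0)

# Grow the positive class to 300 rows with uniform crossover (no KNN filter)
X_over, y_over = oversample_crossover(
    X_demo, y_demo, rows_1=300, mode="uniform", knn=False, random_state=0)

print(X_demo.shape, int((y_demo == 1).sum()))  # original shape and positive count
print(X_over.shape, int((y_over == 1).sum()))  # oversampled shape, now 300 positives

With knn=True, the function additionally drops candidates whose nearest neighbor in the (min-max scaled) original data is not positively labeled, so fewer than int(rows_1 / 0.03) candidates may survive the filter.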
Evaluating the Results
Finally, we loop over 30 random states and compare the performance of a random forest classifier trained on the original dataset against the same classifier trained with each of 11 oversampling methods:
- Random oversampling
- SMOTE with 1 neighbor
- SMOTE with 3 neighbors
- SMOTE with 5 neighbors
- SMOTE with 10 neighbors
- Single-point crossover
- Single-point crossover with KNN filter
- Two-point crossover
- Two-point crossover with KNN filter
- Uniform crossover
- Uniform crossover with KNN filter
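The random and SMOTE baselines are implemented earlier in the post. For readers jumping straight to this section, a plausible sketch of those helpers is shown below; the imbalanced-learn implementation and the exact signatures are assumptions inferred from how they are called in the benchmark code.

from imblearn.over_sampling import RandomOverSampler, SMOTE

def oversample_random(X, y, rows_1, random_state):
    # Randomly duplicate positive rows until the positive class has rows_1 rows
    sampler = RandomOverSampler(sampling_strategy={1: rows_1},
                                random_state=random_state)
    return sampler.fit_resample(X, y)

def oversample_smote(X, y, rows_1, k, random_state):
    # Interpolate new positive rows between existing positives and their
    # k nearest positive neighbors until the positive class has rows_1 rows
    sampler = SMOTE(sampling_strategy={1: rows_1}, k_neighbors=k,
                    random_state=random_state)
    return sampler.fit_resample(X, y)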
We also look at 7 classification metrics:
- ROC AUC - area under the ROC curve
- PR AUC - area under the precision-recall curve
- Balanced accuracy - equivalent to the average recall across the two labels
- Max F1 - the maximum F1 score attainable using the optimal probability threshold
- Recall
- Precision
- F1
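Because "Max F1" is the least standard of these metrics, here is a compact illustration of how it can be computed from the precision-recall curve. The helper name max_f1_score is hypothetical; it mirrors the calculation in the benchmark code below, with a small epsilon added to avoid division by zero.

import numpy as np
from sklearn.metrics import precision_recall_curve

def max_f1_score(y_true, y_prob):
    # F1 evaluated at every candidate threshold along the precision-recall curve
    precision, recall, _ = precision_recall_curve(y_true, y_prob)
    f1 = (2 * precision * recall) / (precision + recall + 1e-12)
    # The best achievable F1 if the probability threshold were tuned optimally
    return np.max(f1)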
Here are the code and results…
metrics_dict= {"ROC AUC": [], "PR AUC": [], "Balanced Accuracy": [], "Max F1": [], "Recall": [], "Precision": [], "F1": [] } results_original=deepcopy(metric_dict) results_random=deepcopy(metric_dict) results_smote_1=deepcopy(metric_dict) results_smote_3=deepcopy(metric_dict) results_smote_5=deepcopy(metric_dict) results_smote_10=deepcopy(metric_dict) results_singlex=deepcopy(metric_dict) results_singlex_knn=deepcopy(metric_dict) results_twox=deepcopy(metric_dict) results_twox_knn=deepcopy(metric_dict) results_uniformx=deepcopy(metric_dict) results_uniformx_knn=deepcopy(metric_dict) foriinrange(30): clf=RandomForestClassifier(random_state=i) X_random, y_random=oversample_random(X_train, y_train, N_ROWS_1, i) X_smote_1, y_smote_1=oversample_smote(X_train, y_train, N_ROWS_1, 1, i) X_smote_3, y_smote_3=oversample_smote(X_train, y_train, N_ROWS_1, 3, i) X_smote_5, y_smote_5=oversample_smote(X_train, y_train, N_ROWS_1, 5, i) X_smote_10, y_smote_10=oversample_smote(X_train, y_train, N_ROWS_1, 10, i) X_singlex, y_singlex=oversample_crossover( X_train, y_train, N_ROWS_1, mode="single", knn=False, random_state=i) X_singlex_knn, y_singlex_knn=oversample_crossover( X_train, y_train, N_ROWS_1, mode="single", knn=True, random_state=i) X_twox, y_twox=oversample_crossover( X_train, y_train, N_ROWS_1, mode="two", knn=False, random_state=i) X_twox_knn, y_twox_knn=oversample_crossover( X_train, y_train, N_ROWS_1, mode="two", knn=True, random_state=i) X_uniformx, y_uniformx=oversample_crossover( X_train, y_train, N_ROWS_1, mode="uniform", knn=False, random_state=i) X_uniformx_knn, y_uniformx_knn=oversample_crossover( X_train, y_train, N_ROWS_1, mode="uniform", knn=True, random_state=i) model=clf.fit(X_train, y_train) roc_auc=roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]) precision_, recall_, _=precision_recall_curve( y_test, model.predict_proba(X_test)[:, 1]) f1_= (2*precision_*recall_) / (recall_+precision_) pr_auc=auc(recall_, precision_) max_f1=np.max(f1_) recall=recall_score(y_test, model.predict(X_test)) precision=precision_score(y_test, model.predict(X_test)) f1=f1_score(y_test, model.predict(X_test)) balanced_accuracy=balanced_accuracy_score(y_test, model.predict(X_test)) results_original["ROC AUC"].append(roc_auc) results_original["PR AUC"].append(pr_auc) results_original["Max F1"].append(max_f1) results_original["Balanced Accuracy"].append(balanced_accuracy) results_original["Recall"].append(recall) results_original["Precision"].append(precision) results_original["F1"].append(f1) model=clf.fit(X_random, y_random) roc_auc=roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]) precision_, recall_, _=precision_recall_curve( y_test, model.predict_proba(X_test)[:, 1]) f1_= (2*precision_*recall_) / (recall_+precision_) pr_auc=auc(recall_, precision_) max_f1=np.max(f1_) recall=recall_score(y_test, model.predict(X_test)) precision=precision_score(y_test, model.predict(X_test)) f1=f1_score(y_test, model.predict(X_test)) balanced_accuracy=balanced_accuracy_score(y_test, model.predict(X_test)) results_random["ROC AUC"].append(roc_auc) results_random["PR AUC"].append(pr_auc) results_random["Max F1"].append(max_f1) results_random["Balanced Accuracy"].append(balanced_accuracy) results_random["Recall"].append(recall) results_random["Precision"].append(precision) results_random["F1"].append(f1) model=clf.fit(X_smote_1, y_smote_1) roc_auc=roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]) precision_, recall_, _=precision_recall_curve( y_test, model.predict_proba(X_test)[:, 1]) f1_= 
(2*precision_*recall_) / (recall_+precision_) pr_auc=auc(recall_, precision_) max_f1=np.max(f1_) recall=recall_score(y_test, model.predict(X_test)) precision=precision_score(y_test, model.predict(X_test)) f1=f1_score(y_test, model.predict(X_test)) balanced_accuracy=balanced_accuracy_score(y_test, model.predict(X_test)) results_smote_1["ROC AUC"].append(roc_auc) results_smote_1["PR AUC"].append(pr_auc) results_smote_1["Max F1"].append(max_f1) results_smote_1["Balanced Accuracy"].append(balanced_accuracy) results_smote_1["Recall"].append(recall) results_smote_1["Precision"].append(precision) results_smote_1["F1"].append(f1) model=clf.fit(X_smote_3, y_smote_3) roc_auc=roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]) precision_, recall_, _=precision_recall_curve( y_test, model.predict_proba(X_test)[:, 1]) f1_= (2*precision_*recall_) / (recall_+precision_) pr_auc=auc(recall_, precision_) max_f1=np.max(f1_) recall=recall_score(y_test, model.predict(X_test)) precision=precision_score(y_test, model.predict(X_test)) f1=f1_score(y_test, model.predict(X_test)) balanced_accuracy=balanced_accuracy_score(y_test, model.predict(X_test)) results_smote_3["ROC AUC"].append(roc_auc) results_smote_3["PR AUC"].append(pr_auc) results_smote_3["Max F1"].append(max_f1) results_smote_3["Balanced Accuracy"].append(balanced_accuracy) results_smote_3["Recall"].append(recall) results_smote_3["Precision"].append(precision) results_smote_3["F1"].append(f1) model=clf.fit(X_smote_5, y_smote_5) roc_auc=roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]) precision_, recall_, _=precision_recall_curve( y_test, model.predict_proba(X_test)[:, 1]) f1_= (2*precision_*recall_) / (recall_+precision_) pr_auc=auc(recall_, precision_) max_f1=np.max(f1_) recall=recall_score(y_test, model.predict(X_test)) precision=precision_score(y_test, model.predict(X_test)) f1=f1_score(y_test, model.predict(X_test)) balanced_accuracy=balanced_accuracy_score(y_test, model.predict(X_test)) results_smote_5["ROC AUC"].append(roc_auc) results_smote_5["PR AUC"].append(pr_auc) results_smote_5["Max F1"].append(max_f1) results_smote_5["Balanced Accuracy"].append(balanced_accuracy) results_smote_5["Recall"].append(recall) results_smote_5["Precision"].append(precision) results_smote_5["F1"].append(f1) model=clf.fit(X_smote_10, y_smote_10) roc_auc=roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]) precision_, recall_, _=precision_recall_curve( y_test, model.predict_proba(X_test)[:, 1]) f1_= (2*precision_*recall_) / (recall_+precision_) pr_auc=auc(recall_, precision_) max_f1=np.max(f1_) recall=recall_score(y_test, model.predict(X_test)) precision=precision_score(y_test, model.predict(X_test)) f1=f1_score(y_test, model.predict(X_test)) balanced_accuracy=balanced_accuracy_score(y_test, model.predict(X_test)) results_smote_10["ROC AUC"].append(roc_auc) results_smote_10["PR AUC"].append(pr_auc) results_smote_10["Max F1"].append(max_f1) results_smote_10["Balanced Accuracy"].append(balanced_accuracy) results_smote_10["Recall"].append(recall) results_smote_10["Precision"].append(precision) results_smote_10["F1"].append(f1) model=clf.fit(X_singlex, y_singlex) roc_auc=roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]) precision_, recall_, _=precision_recall_curve( y_test, model.predict_proba(X_test)[:, 1]) f1_= (2*precision_*recall_) / (recall_+precision_) pr_auc=auc(recall_, precision_) max_f1=np.max(f1_) recall=recall_score(y_test, model.predict(X_test)) precision=precision_score(y_test, model.predict(X_test)) f1=f1_score(y_test, 
model.predict(X_test)) balanced_accuracy=balanced_accuracy_score(y_test, model.predict(X_test)) results_singlex["ROC AUC"].append(roc_auc) results_singlex["PR AUC"].append(pr_auc) results_singlex["Max F1"].append(max_f1) results_singlex["Balanced Accuracy"].append(balanced_accuracy) results_singlex["Recall"].append(recall) results_singlex["Precision"].append(precision) results_singlex["F1"].append(f1) model=clf.fit(X_singlex_knn, y_singlex_knn) roc_auc=roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]) precision_, recall_, _=precision_recall_curve( y_test, model.predict_proba(X_test)[:, 1]) f1_= (2*precision_*recall_) / (recall_+precision_) pr_auc=auc(recall_, precision_) max_f1=np.max(f1_) recall=recall_score(y_test, model.predict(X_test)) precision=precision_score(y_test, model.predict(X_test)) f1=f1_score(y_test, model.predict(X_test)) balanced_accuracy=balanced_accuracy_score(y_test, model.predict(X_test)) results_singlex_knn["ROC AUC"].append(roc_auc) results_singlex_knn["PR AUC"].append(pr_auc) results_singlex_knn["Max F1"].append(max_f1) results_singlex_knn["Balanced Accuracy"].append(balanced_accuracy) results_singlex_knn["Recall"].append(recall) results_singlex_knn["Precision"].append(precision) results_singlex_knn["F1"].append(f1) model=clf.fit(X_twox, y_twox) roc_auc=roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]) precision_, recall_, _=precision_recall_curve( y_test, model.predict_proba(X_test)[:, 1]) f1_= (2*precision_*recall_) / (recall_+precision_) pr_auc=auc(recall_, precision_) max_f1=np.max(f1_) recall=recall_score(y_test, model.predict(X_test)) precision=precision_score(y_test, model.predict(X_test)) f1=f1_score(y_test, model.predict(X_test)) balanced_accuracy=balanced_accuracy_score(y_test, model.predict(X_test)) results_twox["ROC AUC"].append(roc_auc) results_twox["PR AUC"].append(pr_auc) results_twox["Max F1"].append(max_f1) results_twox["Balanced Accuracy"].append(balanced_accuracy) results_twox["Recall"].append(recall) results_twox["Precision"].append(precision) results_twox["F1"].append(f1) model=clf.fit(X_twox_knn, y_twox_knn) roc_auc=roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]) precision_, recall_, _=precision_recall_curve( y_test, model.predict_proba(X_test)[:, 1]) f1_= (2*precision_*recall_) / (recall_+precision_) pr_auc=auc(recall_, precision_) max_f1=np.max(f1_) recall=recall_score(y_test, model.predict(X_test)) precision=precision_score(y_test, model.predict(X_test)) f1=f1_score(y_test, model.predict(X_test)) balanced_accuracy=balanced_accuracy_score(y_test, model.predict(X_test)) results_twox_knn["ROC AUC"].append(roc_auc) results_twox_knn["PR AUC"].append(pr_auc) results_twox_knn["Max F1"].append(max_f1) results_twox_knn["Balanced Accuracy"].append(balanced_accuracy) results_twox_knn["Recall"].append(recall) results_twox_knn["Precision"].append(precision) results_twox_knn["F1"].append(f1) model=clf.fit(X_uniformx, y_uniformx) roc_auc=roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]) precision_, recall_, _=precision_recall_curve( y_test, model.predict_proba(X_test)[:, 1]) f1_= (2*precision_*recall_) / (recall_+precision_) pr_auc=auc(recall_, precision_) max_f1=np.max(f1_) recall=recall_score(y_test, model.predict(X_test)) precision=precision_score(y_test, model.predict(X_test)) f1=f1_score(y_test, model.predict(X_test)) balanced_accuracy=balanced_accuracy_score(y_test, model.predict(X_test)) results_uniformx["ROC AUC"].append(roc_auc) results_uniformx["PR AUC"].append(pr_auc) results_uniformx["Max F1"].append(max_f1) 
results_uniformx["Balanced Accuracy"].append(balanced_accuracy) results_uniformx["Recall"].append(recall) results_uniformx["Precision"].append(precision) results_uniformx["F1"].append(f1) model=clf.fit(X_uniformx_knn, y_uniformx_knn) roc_auc=roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]) precision_, recall_, _=precision_recall_curve( y_test, model.predict_proba(X_test)[:, 1]) f1_= (2*precision_*recall_) / (recall_+precision_) pr_auc=auc(recall_, precision_) max_f1=np.max(f1_) recall=recall_score(y_test, model.predict(X_test)) precision=precision_score(y_test, model.predict(X_test)) f1=f1_score(y_test, model.predict(X_test)) balanced_accuracy=balanced_accuracy_score(y_test, model.predict(X_test)) results_uniformx_knn["ROC AUC"].append(roc_auc) results_uniformx_knn["PR AUC"].append(pr_auc) results_uniformx_knn["Max F1"].append(max_f1) results_uniformx_knn["Balanced Accuracy"].append(balanced_accuracy) results_uniformx_knn["Recall"].append(recall) results_uniformx_knn["Precision"].append(precision) results_uniformx_knn["F1"].append(f1) metrics= ["ROC AUC", "PR AUC", "Max F1", "Balanced Accuracy", "Recall", "Precision", "F1"] formetricinmetrics: plt.figure(figsize=(30, 10)) data=pd.DataFrame(data={ "Original": results_original[metric], "Random": results_random[metric], "SMOTE\nk=1": results_smote_1[metric], "SMOTE\nk=3": results_smote_3[metric], "SMOTE\nk=5": results_smote_5[metric], "SMOTE\nk=10": results_smote_10[metric], "Crossover\nSingle": results_singlex[metric], "Crossover\nSingle KNN": results_singlex_knn[metric], "Crossover\nTwo-points": results_twox[metric], "Crossover\nTwo-points KNN": results_twox_knn[metric], "Crossover\nUniform": results_uniformx[metric], "Crossover\nUniform KNN": results_uniformx_knn[metric], }) sns.boxplot(data=data) plt.title(metric) plt.show()