导入需要的包
import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns import datetime from tqdm import tqdm from sklearn.preprocessing import LabelEncoder from sklearn.feature_selection import SelectKBest from sklearn.feature_selection import chi2 from sklearn.preprocessing import MinMaxScaler import xgboost as xgb import lightgbm as lgb from catboost import CatBoostRegressor import warnings from sklearn.model_selection import StratifiedKFold, KFold from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, log_loss warnings.filterwarnings('ignore')
读取文件
data_train =pd.read_csv('./train.csv') data_test_a = pd.read_csv('./testA.csv')
特征预处理
首先我们查找出数据中的对象特征和数值特征
对数据需要有一定了解
numerical_fea = list(data_train.select_dtypes(exclude=['object']).columns) category_fea = list(filter(lambda x: x not in numerical_fea,list(data_train.columns))) label = 'isDefault' numerical_fea.remove(label)
数据大概了解
data_train.info() • 1
<class 'pandas.core.frame.DataFrame'> RangeIndex: 800000 entries, 0 to 799999 Data columns (total 47 columns): id 800000 non-null int64 loanAmnt 800000 non-null float64 term 800000 non-null int64 interestRate 800000 non-null float64 installment 800000 non-null float64 grade 800000 non-null object subGrade 800000 non-null object employmentTitle 799999 non-null float64 employmentLength 753201 non-null object homeOwnership 800000 non-null int64 annualIncome 800000 non-null float64 verificationStatus 800000 non-null int64 issueDate 800000 non-null object isDefault 800000 non-null int64 purpose 800000 non-null int64 postCode 799999 non-null float64 regionCode 800000 non-null int64 dti 799761 non-null float64 delinquency_2years 800000 non-null float64 ficoRangeLow 800000 non-null float64 ficoRangeHigh 800000 non-null float64 openAcc 800000 non-null float64 pubRec 800000 non-null float64 pubRecBankruptcies 799595 non-null float64 revolBal 800000 non-null float64 revolUtil 799469 non-null float64 totalAcc 800000 non-null float64 initialListStatus 800000 non-null int64 applicationType 800000 non-null int64 earliesCreditLine 800000 non-null object title 799999 non-null float64 policyCode 800000 non-null float64 n0 759730 non-null float64 n1 759730 non-null float64 n2 759730 non-null float64 n2.1 759730 non-null float64 n4 766761 non-null float64 n5 759730 non-null float64 n6 759730 non-null float64 n7 759730 non-null float64 n8 759729 non-null float64 n9 759730 non-null float64 n10 766761 non-null float64 n11 730248 non-null float64 n12 759730 non-null float64 n13 759730 non-null float64 n14 759730 non-null float64 dtypes: float64(33), int64(9), object(5) memory usage: 286.9+ MB
#查看数值的特征 numerical_fea
['id', 'loanAmnt', 'term', 'interestRate', 'installment', 'employmentTitle', 'homeOwnership', 'annualIncome', 'verificationStatus', 'purpose', 'postCode', 'regionCode', 'dti', 'delinquency_2years', 'ficoRangeLow', 'ficoRangeHigh', 'openAcc', 'pubRec', 'pubRecBankruptcies', 'revolBal', 'revolUtil', 'totalAcc', 'initialListStatus', 'applicationType', 'title', 'policyCode', 'n0', 'n1', 'n2', 'n2.1', 'n4', 'n5', 'n6', 'n7', 'n8', 'n9', 'n10', 'n11', 'n12', 'n13', 'n14']
缺失值处理
- 把所有缺失值替换成指定值
- 用上面值填充ffill
- 用下面值填充bfill
#查看缺失值情况 data_train.isnull().sum()
id 0 loanAmnt 0 term 0 interestRate 0 installment 0 grade 0 subGrade 0 employmentTitle 1 employmentLength 46799 homeOwnership 0 annualIncome 0 verificationStatus 0 issueDate 0 isDefault 0 purpose 0 postCode 1 regionCode 0 dti 239 delinquency_2years 0 ficoRangeLow 0 ficoRangeHigh 0 openAcc 0 pubRec 0 pubRecBankruptcies 405 revolBal 0 revolUtil 531 totalAcc 0 initialListStatus 0 applicationType 0 earliesCreditLine 0 title 1 policyCode 0 n0 40270 n1 40270 n2 40270 n2.1 40270 n4 33239 n5 40270 n6 40270 n7 40270 n8 40271 n9 40270 n10 33239 n11 69752 n12 40270 n13 40270 n14 40270 dtype: int64
这里我们可以平均值填充也可以众数。注意mode方法。
#这里的mode方法是去每一列的众数 data_train[category_fea].mode()
grade | subGrade | employmentLength | issueDate | earliesCreditLine | |
0 | B | C1 | 10+ years | 2016-03-01 | Aug-2001 |
#按照平均数填充数值型特征 data_train[numerical_fea] = data_train[numerical_fea].fillna(data_train[numerical_fea].median()) data_test_a[numerical_fea] = data_test_a[numerical_fea].fillna(data_train[numerical_fea].median()) #按照众数填充类别型特征 data_train[category_fea] = data_train[category_fea].fillna(data_train[category_fea].mode()) data_test_a[category_fea] = data_test_a[category_fea].fillna(data_train[category_fea].mode())
data_train.isnull().sum()
id 0 loanAmnt 0 term 0 interestRate 0 installment 0 grade 0 subGrade 0 employmentTitle 0 employmentLength 46799 homeOwnership 0 annualIncome 0 verificationStatus 0 issueDate 0 isDefault 0 purpose 0 postCode 0 regionCode 0 dti 0 delinquency_2years 0 ficoRangeLow 0 ficoRangeHigh 0 openAcc 0 pubRec 0 pubRecBankruptcies 0 revolBal 0 revolUtil 0 totalAcc 0 initialListStatus 0 applicationType 0 earliesCreditLine 0 title 0 policyCode 0 n0 0 n1 0 n2 0 n2.1 0 n4 0 n5 0 n6 0 n7 0 n8 0 n9 0 n10 0 n11 0 n12 0 n13 0 n14 0 dtype: int64
#查看类别特征 category_fea
['grade', 'subGrade', 'employmentLength', 'issueDate', 'earliesCreditLine']
时间格式处理
#转化成时间格式 for data in [data_train, data_test_a]: data['issueDate'] = pd.to_datetime(data['issueDate'],format='%Y-%m-%d') startdate = datetime.datetime.strptime('2007-06-01', '%Y-%m-%d') #构造时间特征 data['issueDateDT'] = data['issueDate'].apply(lambda x: x-startdate).dt.days
data_train['employmentLength'].value_counts(dropna=False).sort_index()
1 year 52489 10+ years 262753 2 years 72358 3 years 64152 4 years 47985 5 years 50102 6 years 37254 7 years 35407 8 years 36192 9 years 30272 < 1 year 64237 NaN 46799 Name: employmentLength, dtype: int64
对象特征转化数值特征
def employmentLength_to_int(s): if pd.isnull(s): return s else: return np.int8(s.split()[0]) for data in [data_train, data_test_a]: data['employmentLength'].replace(to_replace='10+ years', value='10 years', inplace=True) data['employmentLength'].replace('< 1 year', '0 years', inplace=True) data['employmentLength'] = data['employmentLength'].apply(employmentLength_to_int)
data['employmentLength'].value_counts(dropna=False).sort_index()
0.0 15989 1.0 13182 2.0 18207 3.0 16011 4.0 11833 5.0 12543 6.0 9328 7.0 8823 8.0 8976 9.0 7594 10.0 65772 NaN 11742 Name: employmentLength, dtype: int64
sample方法随机取这一列的5个值
data_train['earliesCreditLine'].sample(5) • 1
250886 Nov-2000 528953 Sep-2000 703638 Jun-1989 80494 Mar-2009 715192 Dec-2003 Name: earliesCreditLine, dtype: object
for data in [data_train, data_test_a]: data['earliesCreditLine'] = data['earliesCreditLine'].apply(lambda s: int(s[-4:]))
类别特征处理
# 部分类别特征 cate_features = ['grade', 'subGrade', 'employmentTitle', 'homeOwnership', 'verificationStatus', 'purpose', 'postCode', 'regionCode', \ 'applicationType', 'initialListStatus', 'title', 'policyCode'] for f in cate_features: print(f, '类型数:', data[f].nunique())
grade 类型数: 7 subGrade 类型数: 35 employmentTitle 类型数: 79282 homeOwnership 类型数: 6 verificationStatus 类型数: 3 purpose 类型数: 14 postCode 类型数: 889 regionCode 类型数: 51 applicationType 类型数: 2 initialListStatus 类型数: 2 title 类型数: 12058 policyCod,e 类型数: 1
等级这种固定的类别特征可以使用匿名函数或者map去换比较方便。
for data in [data_train, data_test_a]: data['grade'] = data['grade'].map({'A':1,'B':2,'C':3,'D':4,'E':5,'F':6,'G':7})
类型数在2之上,又不是高维稀疏的,且纯分类特征
# 类型数在2之上,又不是高维稀疏的,且纯分类特征 #get_dummies这个方法可以构建矩阵 for data in [data_train, data_test_a]: data = pd.get_dummies(data, columns=['subGrade', 'homeOwnership', 'verificationStatus', 'purpose', 'regionCode'], drop_first=True)
异常值处理
- 当我们在发现异常值得时候,一定要分清楚是什么原因导致异常值的。如果这异常值并不代表一种规律性话,是偶然的现象,或者说你并不想研究这种偶然的现象,这时可以将其删除。如果异常值存在且代表了一种真实存在的现象,那就不能随便删除。在现有的欺诈场景中很多时候欺诈数据本身相对于正常数据勒说就是异常的,我们要把这些异常点纳入,重新拟合模型,研究其规律。能用监督的用监督模型,不能用的还可以考虑用异常检测的算法来做。
- 注意test的数据不能删
对于检查异常方法之一:均方差
基本上一些连续的数据都符合正态分布的
方法二:箱型图
将数据分成几个部分
def find_outliers_by_3segama(data,fea): data_std = np.std(data[fea]) data_mean = np.mean(data[fea]) outliers_cut_off = data_std * 3 lower_rule = data_mean - outliers_cut_off upper_rule = data_mean + outliers_cut_off data[fea+'_outliers'] = data[fea].apply(lambda x:str('异常值') if x > upper_rule or x < lower_rule else '正常值') return data
data_train = data_train.copy() for fea in numerical_fea: data_train = find_outliers_by_3segama(data_train,fea) print(data_train[fea+'_outliers'].value_counts()) print(data_train.groupby(fea+'_outliers')['isDefault'].sum()) print('*'*10)
正常值 800000 Name: id_outliers, dtype: int64 id_outliers 正常值 159610 Name: isDefault, dtype: int64 ********** 正常值 800000 Name: loanAmnt_outliers, dtype: int64 loanAmnt_outliers 正常值 159610 Name: isDefault, dtype: int64 ********** 正常值 800000 Name: term_outliers, dtype: int64 term_outliers 正常值 159610 Name: isDefault, dtype: int64 ********** 正常值 794259 异常值 5741 Name: interestRate_outliers, dtype: int64 interestRate_outliers 异常值 2916 正常值 156694 Name: isDefault, dtype: int64 ********** 正常值 792046 异常值 7954 Name: installment_outliers, dtype: int64 installment_outliers 异常值 2152 正常值 157458 Name: isDefault, dtype: int64 ********** 正常值 800000 Name: employmentTitle_outliers, dtype: int64 employmentTitle_outliers 正常值 159610 Name: isDefault, dtype: int64 ********** 正常值 799701 异常值 299 Name: homeOwnership_outliers, dtype: int64 homeOwnership_outliers 异常值 62 正常值 159548 Name: isDefault, dtype: int64 ********** 正常值 793973 异常值 6027 Name: annualIncome_outliers, dtype: int64 annualIncome_outliers 异常值 756 正常值 158854 Name: isDefault, dtype: int64 ********** 正常值 800000 Name: verificationStatus_outliers, dtype: int64 verificationStatus_outliers 正常值 159610 Name: isDefault, dtype: int64 ********** 正常值 783003 异常值 16997 Name: purpose_outliers, dtype: int64 purpose_outliers 异常值 3635 正常值 155975 Name: isDefault, dtype: int64 ********** 正常值 798931 异常值 1069 Name: postCode_outliers, dtype: int64 postCode_outliers 异常值 221 正常值 159389 Name: isDefault, dtype: int64 ********** 正常值 799994 异常值 6 Name: regionCode_outliers, dtype: int64 regionCode_outliers 异常值 1 正常值 159609 Name: isDefault, dtype: int64 ********** 正常值 798440 异常值 1560 Name: dti_outliers, dtype: int64 dti_outliers 异常值 466 正常值 159144 Name: isDefault, dtype: int64 ********** 正常值 778245 异常值 21755 Name: delinquency_2years_outliers, dtype: int64 delinquency_2years_outliers 异常值 5089 正常值 154521 Name: isDefault, dtype: int64 ********** 正常值 788261 异常值 11739 Name: ficoRangeLow_outliers, dtype: int64 ficoRangeLow_outliers 异常值 778 正常值 158832 Name: isDefault, dtype: int64 ********** 正常值 788261 异常值 11739 Name: ficoRangeHigh_outliers, dtype: int64 ficoRangeHigh_outliers 异常值 778 正常值 158832 Name: isDefault, dtype: int64 ********** 正常值 790889 异常值 9111 Name: openAcc_outliers, dtype: int64 openAcc_outliers 异常值 2195 正常值 157415 Name: isDefault, dtype: int64 ********** 正常值 792471 异常值 7529 Name: pubRec_outliers, dtype: int64 pubRec_outliers 异常值 1701 正常值 157909 Name: isDefault, dtype: int64 ********** 正常值 794120 异常值 5880 Name: pubRecBankruptcies_outliers, dtype: int64 pubRecBankruptcies_outliers 异常值 1423 正常值 158187 Name: isDefault, dtype: int64 ********** 正常值 790001 异常值 9999 Name: revolBal_outliers, dtype: int64 revolBal_outliers 异常值 1359 正常值 158251 Name: isDefault, dtype: int64 ********** 正常值 799948 异常值 52 Name: revolUtil_outliers, dtype: int64 revolUtil_outliers 异常值 23 正常值 159587 Name: isDefault, dtype: int64 ********** 正常值 791663 异常值 8337 Name: totalAcc_outliers, dtype: int64 totalAcc_outliers 异常值 1668 正常值 157942 Name: isDefault, dtype: int64 ********** 正常值 800000 Name: initialListStatus_outliers, dtype: int64 initialListStatus_outliers 正常值 159610 Name: isDefault, dtype: int64 ********** 正常值 784586 异常值 15414 Name: applicationType_outliers, dtype: int64 applicationType_outliers 异常值 3875 正常值 155735 Name: isDefault, dtype: int64 ********** 正常值 775134 异常值 24866 Name: title_outliers, dtype: int64 title_outliers 异常值 3900 正常值 155710 Name: isDefault, dtype: int64 ********** 正常值 800000 Name: policyCode_outliers, dtype: int64 policyCode_outliers 正常值 159610 Name: isDefault, dtype: int64 ********** 正常值 782773 异常值 17227 Name: n0_outliers, dtype: int64 n0_outliers 异常值 3485 正常值 156125 Name: isDefault, dtype: int64 ********** 正常值 790500 异常值 9500 Name: n1_outliers, dtype: int64 n1_outliers 异常值 2491 正常值 157119 Name: isDefault, dtype: int64 ********** 正常值 789067 异常值 10933 Name: n2_outliers, dtype: int64 n2_outliers 异常值 3205 正常值 156405 Name: isDefault, dtype: int64 ********** 正常值 789067 异常值 10933 Name: n2.1_outliers, dtype: int64 n2.1_outliers 异常值 3205 正常值 156405 Name: isDefault, dtype: int64 ********** 正常值 788660 异常值 11340 Name: n4_outliers, dtype: int64 n4_outliers 异常值 2476 正常值 157134 Name: isDefault, dtype: int64 ********** 正常值 790355 异常值 9645 Name: n5_outliers, dtype: int64 n5_outliers 异常值 1858 正常值 157752 Name: isDefault, dtype: int64 ********** 正常值 786006 异常值 13994 Name: n6_outliers, dtype: int64 n6_outliers 异常值 3182 正常值 156428 Name: isDefault, dtype: int64 ********** 正常值 788430 异常值 11570 Name: n7_outliers, dtype: int64 n7_outliers 异常值 2746 正常值 156864 Name: isDefault, dtype: int64 ********** 正常值 789625 异常值 10375 Name: n8_outliers, dtype: int64 n8_outliers 异常值 2131 正常值 157479 Name: isDefault, dtype: int64 ********** 正常值 786384 异常值 13616 Name: n9_outliers, dtype: int64 n9_outliers 异常值 3953 正常值 155657 Name: isDefault, dtype: int64 ********** 正常值 788979 异常值 11021 Name: n10_outliers, dtype: int64 n10_outliers 异常值 2639 正常值 156971 Name: isDefault, dtype: int64 ********** 正常值 799434 异常值 566 Name: n11_outliers, dtype: int64 n11_outliers 异常值 112 正常值 159498 Name: isDefault, dtype: int64 ********** 正常值 797585 异常值 2415 Name: n12_outliers, dtype: int64 n12_outliers 异常值 545 正常值 159065 Name: isDefault, dtype: int64 ********** 正常值 788907 异常值 11093 Name: n13_outliers, dtype: int64 n13_outliers 异常值 2482 正常值 157128 Name: isDefault, dtype: int64 ********** 正常值 788884 异常值 11116 Name: n14_outliers, dtype: int64 n14_outliers 异常值 3364 正常值 156246 Name: isDefault, dtype: int64 **********
删除异常值
#删除异常值 for fea in numerical_fea: data_train = data_train[data_train[fea+'_outliers']=='正常值'] data_train = data_train.reset_index(drop=True)
通过除法映射到间隔均匀的分箱中
# 通过除法映射到间隔均匀的分箱中,每个分箱的取值范围都是loanAmnt/1000 data['loanAmnt_bin1'] = np.floor_divide(data['loanAmnt'], 1000)
通过对数函数映射到指数宽度分箱
## 通过对数函数映射到指数宽度分箱 data['loanAmnt_bin2'] = np.floor(np.log10(data['loanAmnt']))
data['loanAmnt_bin3'] = pd.qcut(data['loanAmnt'], 10, labels=False)
for col in ['grade', 'subGrade']: temp_dict = data_train.groupby([col])['isDefault'].agg(['mean']).reset_index().rename(columns={'mean': col + '_target_mean'}) temp_dict.index = temp_dict[col].values temp_dict = temp_dict[col + '_target_mean'].to_dict() data_train[col + '_target_mean'] = data_train[col].map(temp_dict) data_test_a[col + '_target_mean'] = data_test_a[col].map(temp_dict)
# 其他衍生变量 mean 和 std for df in [data_train, data_test_a]: for item in ['n0','n1','n2','n2.1','n4','n5','n6','n7','n8','n9','n10','n11','n12','n13','n14']: df['grade_to_mean_' + item] = df['grade'] / df.groupby([item])['grade'].transform('mean') df['grade_to_std_' + item] = df['grade'] / df.groupby([item])['grade'].transform('std')
特征交互
想要丰富特征,特别是对于线性模型而言,除了分箱外,另一种方法是添加原始数据的交互特征和多项式特征。
#label-encode:subGrade,postCode,title # 高维类别特征需要进行转换 for col in tqdm(['employmentTitle', 'postCode', 'title','subGrade']): le = LabelEncoder() le.fit(list(data_train[col].astype(str).values) + list(data_test_a[col].astype(str).values)) data_train[col] = le.transform(list(data_train[col].astype(str).values)) data_test_a[col] = le.transform(list(data_test_a[col].astype(str).values)) print('Label Encoding 完成')
100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:12<00:00, 3.15s/it] Label Encoding 完成
# 删除不需要的数据 for data in [data_train, data_test_a]: data.drop(['issueDate'], axis=1,inplace=True)
"纵向用缺失值上面的值替换缺失值" data_train = data_train.fillna(axis=0,method='ffill') x_train = data_train.drop(['isDefault','id'], axis=1)
#计算协方差 data_corr = x_train.corrwith(data_train.isDefault) #计算相关性 result = pd.DataFrame(columns=['features', 'corr']) result['features'] = data_corr.index result['corr'] = data_corr.values
# 也可以画图图 data_numeric = data_train[numerical_fea] correlation = data_numeric.corr() f , ax = plt.subplots(figsize = (7, 7)) plt.title('Correlation of Numeric Features with Price',y=1,size=16) sns.heatmap(correlation,square = True, vmax=0.8)
<matplotlib.axes._subplots.AxesSubplot at 0x23837a36f88>
features = [f for f in data_train.columns if f not in ['id','issueDate','isDefault'] and '_outliers' not in f] x_train = data_train[features] x_test = data_test_a[features] y_train = data_train['isDefault']
def cv_model(clf, train_x, train_y, test_x, clf_name): folds = 5 seed = 2020 kf = KFold(n_splits=folds, shuffle=True, random_state=seed) train = np.zeros(train_x.shape[0]) test = np.zeros(test_x.shape[0]) cv_scores = [] for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)): print('************************************ {} ************************************'.format(str(i+1))) trn_x, trn_y, val_x, val_y = train_x.iloc[train_index], train_y[train_index], train_x.iloc[valid_index], train_y[valid_index] if clf_name == "lgb": train_matrix = clf.Dataset(trn_x, label=trn_y) valid_matrix = clf.Dataset(val_x, label=val_y) params = { 'boosting_type': 'gbdt', 'objective': 'binary', 'metric': 'auc', 'min_child_weight': 5, 'num_leaves': 2 ** 5, 'lambda_l2': 10, 'feature_fraction': 0.8, 'bagging_fraction': 0.8, 'bagging_freq': 4, 'learning_rate': 0.1, 'seed': 2020, 'nthread': 28, 'n_jobs':24, 'silent': True, 'verbose': -1, } model = clf.train(params, train_matrix, 50000, valid_sets=[train_matrix, valid_matrix], verbose_eval=200,early_stopping_rounds=200) val_pred = model.predict(val_x, num_iteration=model.best_iteration) test_pred = model.predict(test_x, num_iteration=model.best_iteration) # print(list(sorted(zip(features, model.feature_importance("gain")), key=lambda x: x[1], reverse=True))[:20]) if clf_name == "xgb": train_matrix = clf.DMatrix(trn_x , label=trn_y) valid_matrix = clf.DMatrix(val_x , label=val_y) params = {'booster': 'gbtree', 'objective': 'binary:logistic', 'eval_metric': 'auc', 'gamma': 1, 'min_child_weight': 1.5, 'max_depth': 5, 'lambda': 10, 'subsample': 0.7, 'colsample_bytree': 0.7, 'colsample_bylevel': 0.7, 'eta': 0.04, 'tree_method': 'exact', 'seed': 2020, 'nthread': 36, "silent": True, } watchlist = [(train_matrix, 'train'),(valid_matrix, 'eval')] model = clf.train(params, train_matrix, num_boost_round=50000, evals=watchlist, verbose_eval=200, early_stopping_rounds=200) val_pred = model.predict(valid_matrix, ntree_limit=model.best_ntree_limit) test_pred = model.predict(test_x , ntree_limit=model.best_ntree_limit) if clf_name == "cat": params = {'learning_rate': 0.05, 'depth': 5, 'l2_leaf_reg': 10, 'bootstrap_type': 'Bernoulli', 'od_type': 'Iter', 'od_wait': 50, 'random_seed': 11, 'allow_writing_files': False} model = clf(iterations=20000, **params) model.fit(trn_x, trn_y, eval_set=(val_x, val_y), cat_features=[], use_best_model=True, verbose=500) val_pred = model.predict(val_x) test_pred = model.predict(test_x) train[valid_index] = val_pred test = test_pred / kf.n_splits cv_scores.append(roc_auc_score(val_y, val_pred)) print(cv_scores) print("%s_scotrainre_list:" % clf_name, cv_scores) print("%s_score_mean:" % clf_name, np.mean(cv_scores)) print("%s_score_std:" % clf_name, np.std(cv_scores)) return train, test
def lgb_model(x_train, y_train, x_test): lgb_train, lgb_test = cv_model(lgb, x_train, y_train, x_test, "lgb") return lgb_train, lgb_test def xgb_model(x_train, y_train, x_test): xgb_train, xgb_test = cv_model(xgb, x_train, y_train, x_test, "xgb") return xgb_train, xgb_test def cat_model(x_train, y_train, x_test): cat_train, cat_test = cv_model(CatBoostRegressor, x_train, y_train, x_test, "cat")
lgb_train, lgb_test = lgb_model(x_train, y_train, x_test)
************************************ 1 ************************************ [LightGBM] [Warning] num_threads is set with nthread=28, will be overridden by n_jobs=24. Current value: num_threads=24 [LightGBM] [Warning] Unknown parameter: silent Training until validation scores don't improve for 200 rounds [200] training's auc: 0.749114 valid_1's auc: 0.729275 [400] training's auc: 0.764716 valid_1's auc: 0.730125 [600] training's auc: 0.778489 valid_1's auc: 0.729928 Early stopping, best iteration is: [446] training's auc: 0.768137 valid_1's auc: 0.730186 [0.7301862239949224] ************************************ 2 ************************************ [LightGBM] [Warning] num_threads is set with nthread=28, will be overridden by n_jobs=24. Current value: num_threads=24 [LightGBM] [Warning] Unknown parameter: silent Training until validation scores don't improve for 200 rounds [200] training's auc: 0.748999 valid_1's auc: 0.731035 [400] training's auc: 0.764879 valid_1's auc: 0.731436 [600] training's auc: 0.778506 valid_1's auc: 0.730823 Early stopping, best iteration is: [414] training's auc: 0.765823 valid_1's auc: 0.731478 [0.7301862239949224, 0.7314779648434573] ************************************ 3 ************************************ [LightGBM] [Warning] num_threads is set with nthread=28, will be overridden by n_jobs=24. Current value: num_threads=24 [LightGBM] [Warning] Unknown parameter: silent Training until validation scores don't improve for 200 rounds [200] training's auc: 0.748145 valid_1's auc: 0.73253 [400] training's auc: 0.763814 valid_1's auc: 0.733272 [600] training's auc: 0.777895 valid_1's auc: 0.733354 Early stopping, best iteration is: [475] training's auc: 0.769215 valid_1's auc: 0.73355 [0.7301862239949224, 0.7314779648434573, 0.7335502065719879] ************************************ 4 ************************************ [LightGBM] [Warning] num_threads is set with nthread=28, will be overridden by n_jobs=24. Current value: num_threads=24 [LightGBM] [Warning] Unknown parameter: silent Training until validation scores don't improve for 200 rounds [200] training's auc: 0.749417 valid_1's auc: 0.727507 [400] training's auc: 0.765066 valid_1's auc: 0.728261 Early stopping, best iteration is: [353] training's auc: 0.761647 valid_1's auc: 0.728349 [0.7301862239949224, 0.7314779648434573, 0.7335502065719879, 0.7283491938614568] ************************************ 5 ************************************ [LightGBM] [Warning] num_threads is set with nthread=28, will be overridden by n_jobs=24. Current value: num_threads=24 [LightGBM] [Warning] Unknown parameter: silent Training until validation scores don't improve for 200 rounds [200] training's auc: 0.748562 valid_1's auc: 0.73262 [400] training's auc: 0.764493 valid_1's auc: 0.733365 Early stopping, best iteration is: [394] training's auc: 0.764109 valid_1's auc: 0.733381 [0.7301862239949224, 0.7314779648434573, 0.7335502065719879, 0.7283491938614568, 0.7333810157041901] lgb_scotrainre_list: [0.7301862239949224, 0.7314779648434573, 0.7335502065719879, 0.7283491938614568, 0.7333810157041901] lgb_score_mean: 0.7313889209952029 lgb_score_std: 0.001966415347937543
总结
最后在此声明特征工程是数据挖掘中最为重要的一部分,在往往比赛中,还是符合八二原则。占据着最为重要一部分。一定要好好学习
最后阐述一下特征选择
特征选择
特征选择技术可以精简掉无用的特征,以降低最终模型的复杂性,它的最终目的是得到一个简约模型,在不降低预测准确率或对预测准确率影响不大的情况下提高计算速度。特征选择不是为了减少训练时间(实际上,一些技术会增加总体训练时间),而是为了减少模型评分时间。
特征选择的方法:
1、 Filter
- 方差选择法 :方差选择法中,先要计算各个特征的方差,然后根据设定的阈值,选择方差大于阈值的特征
- 相关系数法(pearson 相关系数):Pearson 相关系数 皮尔森相关系数是一种最简单的,可以帮助理解特征和响应变量之间关系的方法,该方法衡量的是变量之间的线性相关性。 结果的取值区间为 [-1,1] , -1 表示完全的负相关, +1表示完全的正相关,0 表示没有线性相关。
- 卡方检验:经典的卡方检验是用于检验自变量对因变量的相关性。 假设自变量有N种取值,因变量有M种取值,考虑自变量等于i且因变量等于j的样本频数的观察值与期望的差距。 其统计量如下: χ2=∑(A−T)2T,其中A为实际值,T为理论值
(注:卡方只能运用在正定矩阵上,否则会报错Input X must be non-negative) - 互信息法: 经典的互信息也是评价自变量对因变量的相关性的。 在feature_selection库的SelectKBest类结合最大信息系数法可以用于选择特征
2、Wrapper (RFE)
- 递归特征消除法:递归特征消除法 递归消除特征法使用一个基模型来进行多轮训练,每轮训练后,消除若干权值系数的特征,再基于新的特征集进行下一轮训练。
3、Embedded
- 基于惩罚项的特征选择法: 基于惩罚项的特征选择法 使用带惩罚项的基模型,除了筛选出特征外,同时也进行了降维。在feature_selection库的SelectFromModel类结合逻辑回归模型可以用于选择特征
- 基于树模型的特征选择:基于树模型的特征选择 树模型中GBDT也可用来作为基模型进行特征选择。 在feature_selection库的SelectFromModel类结合GBDT模型可以用于选择特征
结束语
本次任务三学习到不少知识,文章中如有不足之处,请务必指出,一定迅速改正。谢谢