Preface
Plotting the correlation coefficient heatmap
```python
import matplotlib.pyplot as plt
import seaborn as sns

corr = train.corr()  # correlation coefficients between the variables
xticks = ['x0', 'x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'x10']  # x-axis labels
yticks = list(corr.index)  # y-axis labels
fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(1, 1, 1)
sns.heatmap(corr, annot=True, cmap='rainbow', ax=ax,
            annot_kws={'size': 12, 'weight': 'bold', 'color': 'blue'})  # draw the heatmap
ax.set_xticklabels(xticks, rotation=0, fontsize=12)
ax.set_yticklabels(yticks, rotation=0, fontsize=12)
plt.show()
```
The figure shows that the correlations between the variables are all fairly small; the strongest is between NumberOfOpenCreditLinesAndLoans and NumberRealEstateLoansOrLines, at 0.43.
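The same conclusion can be read off programmatically rather than by eye; a small sketch, assuming `train` is the DataFrame used above:

```python
import numpy as np

c = train.corr().abs()
np.fill_diagonal(c.values, 0)     # ignore the 1.0 self-correlations
row, col = c.stack().idxmax()     # strongest remaining pair
print(row, col, c.loc[row, col])  # expected: the 0.43 pair noted above
```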
Displaying each feature's IV value on a bar chart
```python
import numpy as np

ivlist = [ivx1, ivx2, ivx3, ivx4, ivx5, ivx6, ivx7, ivx8, ivx9, ivx10]  # IV of each variable
index = ['x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'x10']  # x-axis labels
fig1 = plt.figure(figsize=(8, 8))
ax1 = fig1.add_subplot(1, 1, 1)
x = np.arange(len(index)) + 1
ax1.bar(x, ivlist, width=0.4)  # draw the bar chart
ax1.set_xticks(x)
ax1.set_xticklabels(index, rotation=0, fontsize=12)
ax1.set_ylabel('IV (Information Value)', fontsize=12)
# add a numeric label above each bar
for a, b in zip(x, ivlist):
    plt.text(a, b + 0.01, '%.4f' % b, ha='center', va='bottom', fontsize=10)
plt.show()
```
The rule-of-thumb bands for judging a variable's predictive power from its IV are:

- < 0.02: unpredictive
- 0.02 to 0.1: weak
- 0.1 to 0.3: medium
- 0.3 to 0.5: strong
- > 0.5: suspicious

The IV values of DebtRatio, MonthlyIncome, NumberRealEstateLoansOrLines, and NumberOfDependents are clearly on the low side.
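Those cut-offs are simple to encode as a lookup helper; a minimal sketch (the function name and band labels here are my own):

```python
# Map an IV value to the predictive-power band listed above
def iv_band(iv):
    if iv < 0.02:
        return 'unpredictive'
    elif iv < 0.1:
        return 'weak'
    elif iv < 0.3:
        return 'medium'
    elif iv < 0.5:
        return 'strong'
    return 'suspicious'

print(iv_band(0.05))  # 'weak'
```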
WOE transformation
The Weight of Evidence (WOE) transformation converts a logistic regression model into the standard scorecard format.
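For intuition, one common convention computes the WOE of a bin as ln((bads in bin / total bads) / (goods in bin / total goods)). A minimal sketch of that calculation, assuming the target column is the 0/1 `SeriousDlqin2yrs` label (the label in the Give Me Some Credit data these feature names come from); the sign convention behind the post's woex* lists may differ:

```python
import numpy as np
import pandas as pd

# WOE per bin: ln( share of bads in the bin / share of goods in the bin )
def woe_per_bin(x, y, cut):
    bins = pd.cut(x, cut)                 # assign each value to a bin
    bad = y.groupby(bins).sum()           # defaults (y == 1) per bin
    good = y.groupby(bins).count() - bad  # non-defaults per bin
    return np.log((bad / y.sum()) / (good / (len(y) - y.sum())))

# e.g. woe_per_bin(train['RevolvingUtilizationOfUnsecuredLines'],
#                  train['SeriousDlqin2yrs'], cutx1)
```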
```python
from pandas import Series

# Replace each raw value with the WOE of the bin it falls into
def replace_woe(series, cut, woe):
    result = []
    for value in series:
        # scan the bin edges from the highest downwards and stop at the
        # first edge the value reaches (with cut[0] = -inf, m never
        # drops below 0)
        m = len(cut) - 2
        while m > 0 and value < cut[m]:
            m -= 1
        result.append(woe[m])
    return result

train['RevolvingUtilizationOfUnsecuredLines'] = Series(
    replace_woe(train['RevolvingUtilizationOfUnsecuredLines'], cutx1, woex1))
```
This code takes each value in the column and compares it, from high to low, against the 0.75, 0.5, and 0.25 quantile cut points; as soon as the value reaches a cut point, the WOE of the corresponding bin is recorded.
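The same mapping can also be expressed with pandas itself; a sketch assuming `cutx1` holds the bin edges (including ±inf) and `woex1` the per-bin WOE values in matching order. Note that `pd.cut` uses right-closed intervals, so a value exactly on a cut point can land one bin lower than with `replace_woe`:

```python
import pandas as pd

# labels=False makes pd.cut return the integer bin index of each value,
# which indexes straight into the per-bin WOE list
codes = pd.cut(train['RevolvingUtilizationOfUnsecuredLines'], bins=cutx1, labels=False)
train['RevolvingUtilizationOfUnsecuredLines'] = codes.map(lambda m: woex1[int(m)])
```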
Cross-validation can be used for model selection
Concept
- Split the training set into K folds, with K usually 10.
- In turn, hold out one fold as the validation set, train the classifier on the remaining folds, and measure its accuracy on the validation set.
- Take the average accuracy over the K runs as the classifier's accuracy.

A hand-rolled version of this procedure is sketched below.
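A minimal sketch of that loop, assuming `X` and `y` are NumPy arrays (the function name `kfold_accuracy` is my own; the code later in this post uses sklearn's `cross_val_score`, which packages the same procedure):

```python
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

def kfold_accuracy(clf, X, y, k=10):
    scores = []
    for train_idx, val_idx in KFold(n_splits=k, shuffle=True, random_state=0).split(X):
        clf.fit(X[train_idx], y[train_idx])  # train on the other K-1 folds
        preds = clf.predict(X[val_idx])      # predict on the held-out fold
        scores.append(accuracy_score(y[val_idx], preds))
    return np.mean(scores)                   # average accuracy over the K runs
```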
Importing the libraries
```python
# logistic regression
from sklearn.linear_model import LogisticRegression
# ensemble classifiers
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
# k-nearest neighbors classifier
from sklearn.neighbors import KNeighborsClassifier
```
Creating the classifier instances
```python
knMod = KNeighborsClassifier(n_neighbors=5, weights='uniform', algorithm='auto', leaf_size=30,
                             p=2, metric='minkowski', metric_params=None)
lrMod = LogisticRegression(penalty='l1', dual=False, tol=0.0001, C=1.0, fit_intercept=True,
                           intercept_scaling=1, class_weight=None, random_state=None,
                           solver='liblinear', max_iter=100, multi_class='ovr', verbose=2)
adaMod = AdaBoostClassifier(base_estimator=None, n_estimators=200, learning_rate=1.0)
gbMod = GradientBoostingClassifier(loss='deviance', learning_rate=0.1, n_estimators=200,
                                   subsample=1.0, min_samples_split=2, min_samples_leaf=1,
                                   min_weight_fraction_leaf=0.0, max_depth=3, init=None,
                                   random_state=None, max_features=None, verbose=0)
rfMod = RandomForestClassifier(n_estimators=10, criterion='gini', max_depth=None,
                               min_samples_split=2, min_samples_leaf=1,
                               min_weight_fraction_leaf=0.0, max_features='auto',
                               max_leaf_nodes=None, bootstrap=True, oob_score=False,
                               n_jobs=1, random_state=None, verbose=0)
```
Scoring the classifiers with cross_val_score
```python
from sklearn.model_selection import cross_val_score

# cross-validate each candidate model on the training data
def cvDictGen(functions, scr, X_train=X_train, Y_train=Y_train, cv=10, verbose=1):
    cvDict = {}
    for func in functions:
        # cross_val_score chains the whole cross-validation process together,
        # so there is no need to split the data by hand;
        # cv sets how many folds the original data is split into
        cvScore = cross_val_score(func, X_train, Y_train, cv=cv, verbose=verbose, scoring=scr)
        cvDict[str(func).split('(')[0]] = [cvScore.mean(), cvScore.std()]
    return cvDict

cvD = cvDictGen(functions=[knMod, lrMod, adaMod, gbMod, rfMod], scr='roc_auc')

# normalize each model's [mean, std] by those of the first model in the dict
def cvDictNormalize(cvDict):
    cvDictNormalized = {}
    base = list(cvDict.keys())[0]
    for key in cvDict.keys():
        cvDictNormalized[key] = ['{:0.2f}'.format(cvDict[key][0] / cvDict[base][0]),
                                 '{:0.2f}'.format(cvDict[key][1] / cvDict[base][1])]
    return cvDictNormalized

cvDictNormalize(cvD)
The output:
cvD, as [mean, standard deviation] per model:

```
{'KNeighborsClassifier': [0.5887365163416062, 0.011300179653818953],
 'LogisticRegression': [0.8500902765971645, 0.0036164412715674102],
 'AdaBoostClassifier': [0.8583319753215507, 0.004050825383307547],
 'GradientBoostingClassifier': [0.8639129158346284, 0.003503053433053003],
 'RandomForestClassifier': [0.7803945135123486, 0.010025212199131]}
```

Normalized results:
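From the raw cvD numbers, GradientBoostingClassifier has the best mean AUC and KNeighborsClassifier the worst. Ranking the models programmatically takes only a loop; a small sketch over the cvD dict above:

```python
# sort models by mean cross-validated AUC, best first
for name, (mean, std) in sorted(cvD.items(), key=lambda kv: kv[1][0], reverse=True):
    print('{:<28s} {:.4f} +/- {:.4f}'.format(name, mean, std))
```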