前言
本篇承接上篇
分箱逻辑比较复杂，涉及到很多算法
为了确保分析的准确性，我在 PyCharm 中以 Debug（调试）的方式运行这段代码，
一步一步地分析代码的实现逻辑
开发环境（IDE）的准备
python代码准备
python依赖包安装
pip3 install numpy
pip3 install pandas
pip3 install matplotlib==3.2.0
pip3 install --target=./venv/lib/python3.7/site-packages seaborn
pip3 install ipython
pip3 install xlrd
pip3 install scikit-learn
代码逻辑分析
自动分箱
# Custom automatic binning function.
def mono_bin(Y, X, n=20):
    """Bin ``X`` into equal-frequency buckets and compute WOE / IV.

    ``X`` is ranked and cut into ``n`` equal-frequency buckets. The Spearman
    rank correlation between the per-bucket means of ``X`` and ``Y`` is then
    computed, and the bucket count is lowered by one per pass for as long as
    the absolute correlation is below 1. WOE and IV are calculated on the
    buckets of the last pass.

    Args:
        Y: pandas Series with the target. ``Y.sum()`` is used as the count of
            one class and the remaining non-null rows as the other, so a 0/1
            encoding is assumed -- TODO confirm against the caller.
        X: pandas Series with the feature to bin (same index as ``Y``).
        n: bucket count to start from.

    Returns:
        A tuple ``(d4, iv, cut, woe)``:
        ``d4``  -- per-bucket DataFrame (min, max, sum, total, rate, woe,
                   goodattribute, badattribute) sorted by ``min``;
        ``iv``  -- information value summed over the buckets;
        ``cut`` -- cut points: ``-inf``, the interior quantiles of ``X``
                   rounded to 4 decimals, ``inf``;
        ``woe`` -- list of per-bucket WOE values rounded to 3 decimals.

    Side effects:
        Prints ``d4`` to stdout.
    """
    r = 0
    # Number of "good" samples (6936 in the walkthrough's debug session).
    good = Y.sum()
    # Number of "bad" samples (94957 in the walkthrough's debug session).
    bad = Y.count() - good

    # Rank X once (ties broken by order of appearance) and bucket the ranks
    # rather than the raw values; the ranking does not depend on n, so it is
    # computed outside the loop.
    ranks = X.rank(method="first")

    # The condition is re-evaluated on every pass. Caching np.abs(r) in a
    # variable before the loop would freeze it at 0 and the loop could never
    # finish normally. (Original debug note: |r| was about 0.927 on an early
    # pass -- presumably for the walkthrough's data set.)
    while np.abs(r) < 1:
        buckets = pd.qcut(ranks, n)
        d1 = pd.DataFrame({"X": X, "Y": Y, "Bucket": buckets})
        d2 = d1.groupby("Bucket", as_index=True)
        bucket_means = d2.mean()
        # Spearman rank correlation measures how monotonic the relationship
        # between the bucket means of X and Y is.
        r, _ = stats.spearmanr(bucket_means.X, bucket_means.Y)
        n = n - 1

    # Per-bucket statistics taken from the grouping of the last pass.
    d3 = pd.DataFrame({
        "min": d2["X"].min(),
        "max": d2["X"].max(),
        "sum": d2["Y"].sum(),
        "total": d2["Y"].count(),
        "rate": bucket_means.Y,
    })
    d3["woe"] = np.log((d3["rate"] / (1 - d3["rate"])) / (good / bad))
    d3["goodattribute"] = d3["sum"] / good
    d3["badattribute"] = (d3["total"] - d3["sum"]) / bad
    iv = ((d3["goodattribute"] - d3["badattribute"]) * d3["woe"]).sum()
    d4 = d3.sort_values(by="min")
    print(d4)

    # n was decremented once more after the last grouping, so that grouping
    # has n + 1 buckets and therefore n interior cut points.
    # NOTE(review): buckets were formed on ranks, while the cut points are
    # quantiles of the raw X values; with many tied X values the two may not
    # line up exactly -- confirm this is intended.
    cut = [float("-inf")]
    cut.extend(round(X.quantile(i / (n + 1)), 4) for i in range(1, n + 1))
    cut.append(float("inf"))
    woe = list(d4["woe"].round(3))
    return d4, iv, cut, woe


# Call the custom binning function. It is placed after the definition so the
# name exists when the call runs, and guarded so it only runs when the file is
# executed directly. `train` is not defined in this snippet -- presumably the
# training DataFrame prepared in the previous part of the walkthrough.
if __name__ == "__main__":
    dfx1, ivx1, cutx1, woex1 = mono_bin(train.SeriousDlqin2yrs, train.RevolvingUtilizationOfUnsecuredLines, n=10)