# 【程序媛晒83行代码】云栖社区聚能聊专家，背后83行代码的故事

# Per-block missing-value statistics (23 blocks).
# block_list holds 1-based column positions; each pair of consecutive entries
# delimits a half-open range [start, end) of train_test columns.
block_list = [1,5,6,20,24,28,32,36,48,52,54,64,72,76,102,107,111,155,161,166,211,254,278,298]

for i in tqdm(range(len(block_list)-1)):
    start, end = block_list[i], block_list[i + 1]
    tmp_df = train_test.iloc[:, start - 1:end - 1]
    print(tmp_df.columns)
    count_col = 'count_f' + str(start) + '_f' + str(end)
    # Count the non-null cells of every row directly. This avoids transposing
    # the slice and avoids writing a new column into a slice of train_test
    # (which triggers SettingWithCopyWarning).
    train_test[count_col] = tmp_df.count(axis=1)
    # 1 when every cell of the block is missing for that row, otherwise 0.
    # NOTE: this name ends with end - 1 while the count column ends with end;
    # both names are kept as-is so existing consumers keep working.
    train_test['count_label_f' + str(start) + '_f' + str(end - 1)] = np.where(train_test[count_col] > 0, 0, 1)

# Per-date frequency and share of every value of each column in r_list.
# For each column i two columns are merged back into train_test:
#   i + '_rcount' : number of rows on that date holding the same value
#   i + '_ratio'  : that count divided by the number of rows on that date
for i in tqdm(r_list):
    per_date_frames = []
    for j in date_list:
        df_tmp = train_test[train_test['date'] == j]
        # Name the value and count columns explicitly: the default names
        # produced by value_counts() differ between pandas 1.x and 2.x.
        f_ratio = df_tmp[i].value_counts().rename_axis(i).reset_index(name=i + '_rcount')
        f_ratio['date'] = j
        f_ratio[i + '_ratio'] = f_ratio[i + '_rcount'] / len(df_tmp)
        per_date_frames.append(f_ratio)
    # Concatenate once per column instead of growing a frame inside the loop.
    df_dratio = pd.concat(per_date_frames, axis=0).reset_index(drop=True)
    train_test = train_test.merge(df_dratio, on=['date', i], how='left')

import copy
def cal_all_altcol_1(col_, labelcol = None, error_corr = False):
    """Build derived feature columns from one input column.

    Args:
        col_: DataFrame whose first column name is used as the prefix of the
            output column names. It is copied, never modified. The output
            naming assumes it holds exactly one column.
        labelcol: Forwarded to ``calmean``. When it is not None the signed
            deviation column is emitted in addition to the absolute one.
        error_corr: When truthy, deviation-from-mean columns are added.

    Returns:
        DataFrame with columns ``<name>_X`` and ``<name>_1/X``; with
        ``error_corr`` also ``<name>_Xerr`` (only when ``labelcol`` is given)
        and ``<name>_Xerr_abs``.
    """
    col = col_.copy()
    colname = col.columns[0]

    alt_col = [col, 1 / col]
    alt_col_name = [colname + "_X", colname + "_1/X"]

    # Truthiness test: the previous `== True` / `== False` pair left the
    # result lists undefined for any other value (e.g. None).
    if error_corr:
        if labelcol is not None:
            mean = pd.DataFrame(calmean(col, labelcol))
            err = col - mean
            alt_col += [err, np.fabs(err)]
            alt_col_name += [colname + "_Xerr", colname + "_Xerr_abs"]
        else:
            # NOTE(review): here the mean is used as returned by calmean,
            # without the DataFrame wrapper of the branch above; calmean is
            # defined elsewhere - confirm this difference is intended.
            mean = calmean(col, labelcol)
            alt_col.append(np.fabs(col - mean))
            alt_col_name.append(colname + "_Xerr_abs")

    alt_list = pd.concat(alt_col, axis = 1)
    alt_list.columns = alt_col_name
    return alt_list

def cal_all_altcol_1_base(col_, labelcol = None, error_corr = False):
    """Build the basic derived columns from one input column (no reciprocal).

    Args:
        col_: DataFrame whose first column name is used as the prefix of the
            output column names. It is copied, never modified. The output
            naming assumes it holds exactly one column.
        labelcol: Forwarded to ``calmean``. When it is not None the signed
            deviation column is emitted in addition to the absolute one.
        error_corr: When truthy, deviation-from-mean columns are added.

    Returns:
        DataFrame with column ``<name>_X``; with ``error_corr`` also
        ``<name>_Xerr_abs`` and, only when ``labelcol`` is given,
        ``<name>_Xerr`` (in that order).
    """
    col = col_.copy()
    colname = col.columns[0]

    alt_col = [col]
    alt_col_name = [colname + "_X"]

    # Truthiness test: the previous `== True` / `== False` pair left the
    # result lists undefined for any other value (e.g. None).
    if error_corr:
        # Both labelcol cases compute the mean the same way; the leftover
        # debug print of the mean has been dropped.
        mean = pd.DataFrame(calmean(col, labelcol))
        err = col - mean
        alt_col.append(np.fabs(err))
        alt_col_name.append(colname + "_Xerr_abs")
        if labelcol is not None:
            alt_col.append(err)
            alt_col_name.append(colname + "_Xerr")

    alt_list = pd.concat(alt_col, axis = 1)
    alt_list.columns = alt_col_name
    return alt_list

def cal_all_altcol_2(col_i, col_j, labelcol = None, error_corr = False):
    """Build pairwise arithmetic feature columns from two input columns.

    Args:
        col_i: DataFrame providing X; its first column name forms the first
            half of the output prefix. Copied, never modified.
        col_j: DataFrame providing Y; its first column name forms the second
            half of the output prefix. Copied, never modified.
        labelcol: Forwarded to ``calmean``. When it is not None the signed
            deviation columns are emitted in addition to the absolute ones.
        error_corr: When truthy, deviation-from-mean columns are added.

    Returns:
        DataFrame whose columns are named ``<xi>_<xj>_<op>`` for the ops
        ``X+Y, X-Y, X*Y, X/Y, Y/X``; with ``error_corr`` followed by the
        ``<op>err_abs`` columns and, only when ``labelcol`` is given, the
        ``<op>err`` columns.
    """
    coli = col_i.copy()
    colj = col_j.copy()
    colbet = coli.columns[0] + "_" + colj.columns[0]
    # Both frames need the same column label, otherwise the element-wise
    # arithmetic below would align on different labels.
    coli.columns, colj.columns = [colbet], [colbet]

    combos = [
        ("_X+Y", coli + colj),
        ("_X-Y", coli - colj),
        ("_X*Y", coli * colj),
        ("_X/Y", coli / colj),
        ("_Y/X", colj / coli),
    ]
    alt_col = [frame for _, frame in combos]
    alt_col_name = [colbet + suffix for suffix, _ in combos]

    # Truthiness test: the previous `== True` / `== False` pair left the
    # result lists undefined for any other value (e.g. None).
    if error_corr:
        # Deviation of every combination from its mean, computed once each.
        errs = [(suffix, frame - pd.DataFrame(calmean(frame, labelcol)))
                for suffix, frame in combos]
        alt_col += [np.fabs(err) for _, err in errs]
        alt_col_name += [colbet + suffix + "err_abs" for suffix, _ in errs]
        if labelcol is not None:
            alt_col += [err for _, err in errs]
            alt_col_name += [colbet + suffix + "err" for suffix, _ in errs]

    alt_list = pd.concat(alt_col, axis = 1)
    alt_list.columns = alt_col_name
    return alt_list
# def cal_all_alt2tol(coli, colj, )
# cal_all_altcol_2(test, test2)

### 与海洋的云小姐姐互动，为她打call——>点击进去晒码

|
11月前
510特辑 | 读懂阿里日，也就读懂了阿里
510特辑 | 读懂阿里日，也就读懂了阿里
204 0
|
8月前
|

52 0
|

1314 0
|
SQL 分布式计算 前端开发

140 0
|

895 0
|

【云栖精选】帮你把握“金三银四”，阿里开发者招聘节面经总结帖来袭

7355 0
|

|

|

4609 0
|