4.3附件一的处理:
# Load Attachment 1 (the 325-sample workbook) and take a full-width slice of it.
data_325_all = pd.read_excel('附件一:325个样本数据.xlsx')
data_325_all_cao_zuo = data_325_all.iloc[:, 0:]
data_325_all_cao_zuo
先对数据进行简单的处理。中间的是不变的。
def check_data(data_325_all_cao_zuo, min_values, max_values):
    """Return the value unchanged when it is within the bounds, otherwise NaN.

    Args:
        data_325_all_cao_zuo: a single cell value.
        min_values: lower bound; values strictly below it are rejected.
        max_values: upper bound; values strictly above it are rejected.
    """
    out_of_range = (data_325_all_cao_zuo > max_values) or (data_325_all_cao_zuo < min_values)
    return np.nan if out_of_range else data_325_all_cao_zuo


# For every row of data_range, read the column name (column 1) and the bounds
# (columns 6 and 7), then blank out the cells of that column outside the bounds.
for j in range(data_range.shape[0]):
    names = data_range.iloc[j, 1]
    data_min = data_range.iloc[j, 6]
    data_max = data_range.iloc[j, 7]
    data_325_all_cao_zuo[names] = data_325_all_cao_zuo[names].apply(
        check_data, args=(data_min, data_max)
    )
data_325_all_cao_zuo
# Missing-value count per column, showing only the columns that have any.
null_counts_325 = data_325_all_cao_zuo.isnull().sum()
null_counts_325[null_counts_325 > 0]
4.4 拉依达准则
def three_sigma(df_col):
    """Report how many values of one column lie outside mean +/- 3 * std.

    Args:
        df_col: one column of a DataFrame.

    Returns:
        A two-item list ``[columns, count]``. ``columns`` is the column index
        of the one-column frame built from the flagged values, and ``count``
        is the number of flagged values.
    """
    center = df_col.mean()
    spread = 3 * df_col.std()
    too_low = center - spread > df_col
    too_high = center + spread < df_col
    # Positions (not labels) of the values that break the rule.
    flagged_positions = np.arange(df_col.shape[0])[too_low | too_high]
    flagged = pd.DataFrame(df_col.iloc[flagged_positions])
    return [flagged.columns, flagged.shape[0]]
# 285
# Run the 3-sigma report on every column of `data`, one entry per column.
# NOTE(review): `data` is not assigned in the visible text; presumably it is
# the sample-285 frame -- confirm.
out_range_285_idx = [
    three_sigma(data.iloc[:, col_pos]) for col_pos in range(data.shape[1])
]
out_range_285_idx
# Count the columns for which the 3-sigma report flagged no value at all.
counts = sum(1 for report in out_range_285_idx if report[1] == 0)
counts
354
# Same count for the 313 reports: columns with zero flagged values.
counts = sum(1 for report in out_range_313_idx if report[1] == 0)
counts
313
找出异常数据。
# Collect (columns, flagged count) for every column with at least one flagged
# value. Iterates the list itself rather than a fixed 354 entries, so it
# follows the actual number of reports.
index_313 = [
    (report[0], report[1]) for report in out_range_313_idx if report[1] != 0
]
index_313
处理完的数据:
链接: https://pan.baidu.com/s/11OL6B3d3FV8oJ2aQlBK3Kg 提取码: 8u4u
4.5 缺失值的处理
首先计算各位点数据的缺失值比率。将计算值与缺失值比率的阈值(20%)相比,按照其是否超过阈值将缺失数据分为两类:
(1)缺失值比率低的数据;
(2)数据缺失值比率高的数据。
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Read the sample-285 sheet and drop its first column.
data_285 = pd.read_excel(
    '附件三:285号和313号样本原始数据.xlsx', sheet_name='操作变量285'
).iloc[:, 1:]
data_285
# Read the sample-313 sheet and drop its first column.
data_313 = pd.read_excel(
    '附件三:285号和313号样本原始数据.xlsx', sheet_name='操作变量313'
).iloc[:, 1:]
data_313
检查不符合3σ原则的数据,并标记为空值
def three_sigma(data_input):
    """Replace, in place, every value more than 3 std from its column mean with NaN.

    The mean and standard deviation of each column are computed once, from the
    values the column holds on entry. Every cell of that column is judged
    against those same limits, so the result does not depend on row order.

    Args:
        data_input: DataFrame of numeric columns; it is modified in place.

    Returns:
        The same ``data_input`` object, with the flagged cells set to NaN.
    """
    for col_pos in range(data_input.shape[1]):
        column = data_input.iloc[:, col_pos]
        # NaN cells compare False here and are therefore left untouched.
        outliers = (column - column.mean()).abs() > 3 * column.std()
        if outliers.any():
            data_input.iloc[outliers.to_numpy(), col_pos] = np.nan
    return data_input
我们看一下313的数据集:
# three_sigma edits its argument in place and returns it, so data_313_2 and
# data_313 refer to the same frame.
data_313_2 = three_sigma(data_313)
data_313_2
# Missing-value count per column, showing only the columns that have any.
null_counts_313 = data_313_2.isnull().sum()
null_counts_313[null_counts_313 > 0]
第一列为索引位置,我们检查一下空值的位置:
# Record the location of every missing cell as (row index, column name); the
# first item of each tuple is the index position. The null mask is built once
# instead of once per cell.
null_mask_313 = data_313_2.isnull()
isnull = [
    (row_label, col_name)
    for col_name in null_mask_313.columns
    for row_label in null_mask_313.index
    if null_mask_313.loc[row_label, col_name]
]
isnull, len(isnull)
尝试查看一个:
# Spot-check one of the cells.
data_313_2.loc[37, 'S-ZORB.FC_2801.PV']
nan
from scipy.interpolate import lagrange


def lag_fill(df, i, k):
    """Estimate the value at index ``i`` by Lagrange interpolation of its neighbours.

    Up to ``k`` entries before and ``k`` entries after ``i`` are taken, clipped
    to the ends of the column. Neighbours that are themselves missing are
    discarded before fitting.

    Args:
        df: the column that contains the missing value. Neighbours are looked
            up with ``.loc`` on integer labels, so this assumes the default
            0..n-1 index -- TODO confirm for other indexes.
        i: index of the missing value along axis 0.
        k: number of neighbours to use on each side of ``i``.

    Returns:
        The interpolating polynomial evaluated at ``i``, or NaN when no
        neighbour holds a value (fitting zero points would otherwise yield a
        meaningless 0.0).
    """
    lower = max(i - k, 0)
    upper = min(i + 1 + k, len(df.index))
    neighbours = df.loc[list(range(lower, i)) + list(range(i + 1, upper))].dropna()
    if neighbours.empty:
        return float("nan")
    return lagrange(neighbours.index.values, neighbours.values)(i)
# Fill every recorded gap from its immediate neighbours (k=1).
for row_label, col_name in isnull:
    fnum = lag_fill(data_313_2.loc[:, col_name], row_label, 1)
    data_313_2.loc[row_label, col_name] = fnum
我们检验一下新数据是否合适:
# Run the 3-sigma check once more on the filled data.
data_313_2_new = three_sigma(data_313_2)
data_313_2_new
# Missing-value count per column after the second pass, non-zero entries only.
null_counts_313_new = data_313_2_new.isnull().sum()
null_counts_313_new[null_counts_313_new > 0]
# Locations of the cells blanked by the second pass, as (row index, column
# name). The null mask is built once instead of once per cell.
null_mask_313_2 = data_313_2_new.isnull()
isnull_2 = [
    (row_label, col_name)
    for col_name in null_mask_313_2.columns
    for row_label in null_mask_313_2.index
    if null_mask_313_2.loc[row_label, col_name]
]
isnull_2, len(isnull_2)
# Fill the gaps found by the second pass from their immediate neighbours (k=1).
for row_label, col_name in isnull_2:
    fnum_1 = lag_fill(data_313_2_new.loc[:, col_name], row_label, 1)
    data_313_2_new.loc[row_label, col_name] = fnum_1
再次检查:
# Third 3-sigma pass, followed by the per-column count of cells it blanked.
data_313_2_new_2 = three_sigma(data_313_2_new)
null_counts_313_new_2 = data_313_2_new_2.isnull().sum()
null_counts_313_new_2[null_counts_313_new_2 > 0]
# Locations of the cells blanked by the third pass, as (row index, column
# name). The null mask is built once instead of once per cell.
null_mask_313_3 = data_313_2_new_2.isnull()
isnull_3 = [
    (row_label, col_name)
    for col_name in null_mask_313_3.columns
    for row_label in null_mask_313_3.index
    if null_mask_313_3.loc[row_label, col_name]
]
isnull_3, len(isnull_3)
# Fill the gaps found by the third pass from their immediate neighbours (k=1).
for row_label, col_name in isnull_3:
    fnum_2 = lag_fill(data_313_2_new_2.loc[:, col_name], row_label, 1)
    data_313_2_new_2.loc[row_label, col_name] = fnum_2
# NOTE(review): the visible text never assigns data_313_2_new_3, so using it
# raises NameError. By analogy with the earlier passes it is taken to be one
# more 3-sigma pass over the previous result -- confirm.
data_313_2_new_3 = three_sigma(data_313_2_new_2)

# Locations of the cells blanked by this pass, as (row index, column name).
# The null mask is built once instead of once per cell.
null_mask_313_4 = data_313_2_new_3.isnull()
isnull_4 = [
    (row_label, col_name)
    for col_name in null_mask_313_4.columns
    for row_label in null_mask_313_4.index
    if null_mask_313_4.loc[row_label, col_name]
]
isnull_4, len(isnull_4)
# Fill the remaining gaps from their immediate neighbours (k=1).
for row_label, col_name in isnull_4:
    fnum_3 = lag_fill(data_313_2_new_3.loc[:, col_name], row_label, 1)
    data_313_2_new_3.loc[row_label, col_name] = fnum_3

# Final 3-sigma pass and the per-column count of cells it blanked.
data_313_2_new_4 = three_sigma(data_313_2_new_3)
null_counts_313_new_4 = data_313_2_new_4.isnull().sum()
null_counts_313_new_4[null_counts_313_new_4 > 0]
至此,数据处理结束。