Pandas缺失数据处理
Pandas用np.nan代表缺失数据
import pandas as pd
import numpy as np
# Ten consecutive calendar days starting 2013-01-01 serve as the row index.
dates = pd.date_range(start='20130101', periods=10, freq='D')
# One row of standard-normal samples per date, in four columns labelled A-D.
df = pd.DataFrame(
    np.random.randn(len(dates), 4),
    index=dates,
    columns=list('ABCD'),
)
# Preview the first five rows.
df.head()
| | A | B | C | D |
|---|---|---|---|---|
| 2013-01-01 | -0.031531 | 1.231280 | -1.069298 | 1.068172 |
| 2013-01-02 | -0.216581 | 0.535341 | -1.408095 | 0.677334 |
| 2013-01-03 | 0.262541 | -0.034165 | 0.712012 | 0.053880 |
| 2013-01-04 | 0.142971 | -0.009381 | -0.369560 | 2.142902 |
| 2013-01-05 | -0.483484 | 1.896420 | -1.087918 | -0.608670 |
reindex()
可以修改索引(行索引和列标签),并返回数据的一个副本;原数据中不存在的标签会以 NaN 填充:
# Keep only the first four dates and request a column 'E' that df does not
# have; the displayed result shows the new column filled with NaN.
df1 = df.reindex(index = dates[0:4], columns = ['A', 'B', 'C', 'D', 'E'])
# Bare expression: displays df1 as the cell output.
df1
| | A | B | C | D | E |
|---|---|---|---|---|---|
| 2013-01-01 | -0.031531 | 1.231280 | -1.069298 | 1.068172 | NaN |
| 2013-01-02 | -0.216581 | 0.535341 | -1.408095 | 0.677334 | NaN |
| 2013-01-03 | 0.262541 | -0.034165 | 0.712012 | 0.053880 | NaN |
| 2013-01-04 | 0.142971 | -0.009381 | -0.369560 | 2.142902 | NaN |
# Reindex to the first four dates; here the column list is built by
# concatenating two list literals, which appends the new column 'E'.
df2 = df.reindex(index=dates[0:4], columns=['A','B','C','D']+['E'])
# Bare expression: displays df2 as the cell output.
df2
| | A | B | C | D | E |
|---|---|---|---|---|---|
| 2013-01-01 | -0.031531 | 1.231280 | -1.069298 | 1.068172 | NaN |
| 2013-01-02 | -0.216581 | 0.535341 | -1.408095 | 0.677334 | NaN |
| 2013-01-03 | 0.262541 | -0.034165 | 0.712012 | 0.053880 | NaN |
| 2013-01-04 | 0.142971 | -0.009381 | -0.369560 | 2.142902 | NaN |
# Reindex to the first four dates; the column list is derived from df's own
# columns plus 'E', so the existing labels do not have to be typed out.
df3 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E'])
# Bare expression: displays df3 as the cell output.
df3
| | A | B | C | D | E |
|---|---|---|---|---|---|
| 2013-01-01 | -0.031531 | 1.231280 | -1.069298 | 1.068172 | NaN |
| 2013-01-02 | -0.216581 | 0.535341 | -1.408095 | 0.677334 | NaN |
| 2013-01-03 | 0.262541 | -0.034165 | 0.712012 | 0.053880 | NaN |
| 2013-01-04 | 0.142971 | -0.009381 | -0.369560 | 2.142902 | NaN |
# Label-based slice on the row index: set column 'E' to 1 for the rows from
# dates[0] through dates[1]. The displayed result shows both end rows were
# assigned, while the remaining rows keep NaN.
df3.loc[dates[0]:dates[1],'E'] = 1
# Bare expression: displays the modified df3 as the cell output.
df3
| | A | B | C | D | E |
|---|---|---|---|---|---|
| 2013-01-01 | -0.031531 | 1.231280 | -1.069298 | 1.068172 | 1.0 |
| 2013-01-02 | -0.216581 | 0.535341 | -1.408095 | 0.677334 | 1.0 |
| 2013-01-03 | 0.262541 | -0.034165 | 0.712012 | 0.053880 | NaN |
| 2013-01-04 | 0.142971 | -0.009381 | -0.369560 | 2.142902 | NaN |
对缺失值进行填充
# Display a copy of df1 with every NaN replaced by 5. The result is not
# assigned, and df1 still shows NaN in 'E' when it is inspected later on.
df1.fillna(value=5)
| | A | B | C | D | E |
|---|---|---|---|---|---|
| 2013-01-01 | -0.031531 | 1.231280 | -1.069298 | 1.068172 | 5.0 |
| 2013-01-02 | -0.216581 | 0.535341 | -1.408095 | 0.677334 | 5.0 |
| 2013-01-03 | 0.262541 | -0.034165 | 0.712012 | 0.053880 | 5.0 |
| 2013-01-04 | 0.142971 | -0.009381 | -0.369560 | 2.142902 | 5.0 |
# Fill the NaN values of df1's 'E' column with 5 and store the resulting
# Series as df2's 'E' column; this overwrites df2 in place.
df2['E'] = df1['E'].fillna(value=5)
# Bare expression: displays the modified df2 as the cell output.
df2
| | A | B | C | D | E |
|---|---|---|---|---|---|
| 2013-01-01 | -0.031531 | 1.231280 | -1.069298 | 1.068172 | 5.0 |
| 2013-01-02 | -0.216581 | 0.535341 | -1.408095 | 0.677334 | 5.0 |
| 2013-01-03 | 0.262541 | -0.034165 | 0.712012 | 0.053880 | 5.0 |
| 2013-01-04 | 0.142971 | -0.009381 | -0.369560 | 2.142902 | 5.0 |
丢掉含有缺失项的行:
# Display df3 without the rows that contain at least one NaN. Only the two
# rows whose 'E' was set to 1 remain; the result is not assigned.
df3.dropna(how = 'any')
| | A | B | C | D | E |
|---|---|---|---|---|---|
| 2013-01-01 | -0.031531 | 1.231280 | -1.069298 | 1.068172 | 1.0 |
| 2013-01-02 | -0.216581 | 0.535341 | -1.408095 | 0.677334 | 1.0 |
判断每个元素是否为缺失值,得到布尔掩码(缺失处为 True):
# Method form: build a same-shaped boolean frame that is True wherever df1
# holds a missing value (here the whole 'E' column).
df4 = df1.isnull()
# Bare expression: displays the mask as the cell output.
df4
| | A | B | C | D | E |
|---|---|---|---|---|---|
| 2013-01-01 | False | False | False | False | True |
| 2013-01-02 | False | False | False | False | True |
| 2013-01-03 | False | False | False | False | True |
| 2013-01-04 | False | False | False | False | True |
# Function form of the same check: pd.isnull applied to df1 gives a boolean
# frame that is True at the missing entries, as the displayed output shows.
df5 = pd.isnull(df1)
# Bare expression: displays the mask as the cell output.
df5
| | A | B | C | D | E |
|---|---|---|---|---|---|
| 2013-01-01 | False | False | False | False | True |
| 2013-01-02 | False | False | False | False | True |
| 2013-01-03 | False | False | False | False | True |
| 2013-01-04 | False | False | False | False | True |