Pandas数据规整
数据分析和建模方面的大量编程工作都是用在数据准备上的，有时候存放在文件或数据库中的数据并不能满足数据处理应用的要求。
Pandas提供了一组高级的、灵活的、高效的核心函数和算法，它们能够轻松地将数据规整化为你需要的形式。
合并
连接
Pandas提供了大量方法，能轻松地对Series、DataFrame和Panel执行合并操作（注意：Panel已在pandas 0.25中被移除，新版本中只需关注Series和DataFrame）。
连接pandas对象 pd.concat()
import numpy as np
import pandas as pd
df = pd.DataFrame(np.random.randn(10, 4))
df.head()
|
0 |
1 |
2 |
3 |
0 |
0.231308 |
1.193636 |
-0.033288 |
0.826399 |
1 |
-0.421474 |
-0.618510 |
-1.266325 |
-0.439435 |
2 |
-0.279457 |
0.578144 |
1.131353 |
-0.639720 |
3 |
-1.197750 |
-0.446579 |
0.495728 |
0.900704 |
4 |
-0.638926 |
-0.233019 |
-1.106248 |
-0.762133 |
pieces = [df[:2], df[3:5], df[7:]]
pieces
[ 0 1 2 3
0 0.231308 1.193636 -0.033288 0.826399
1 -0.421474 -0.618510 -1.266325 -0.439435,
0 1 2 3
3 -1.197750 -0.446579 0.495728 0.900704
4 -0.638926 -0.233019 -1.106248 -0.762133,
0 1 2 3
7 -0.265515 -0.705797 0.695531 -0.257374
8 0.552615 -0.137180 0.859215 -0.853752
9 -1.014105 0.392409 -1.832748 0.612679]
df2 = pd.concat(pieces)
df2
|
0 |
1 |
2 |
3 |
0 |
0.231308 |
1.193636 |
-0.033288 |
0.826399 |
1 |
-0.421474 |
-0.618510 |
-1.266325 |
-0.439435 |
3 |
-1.197750 |
-0.446579 |
0.495728 |
0.900704 |
4 |
-0.638926 |
-0.233019 |
-1.106248 |
-0.762133 |
7 |
-0.265515 |
-0.705797 |
0.695531 |
-0.257374 |
8 |
0.552615 |
-0.137180 |
0.859215 |
-0.853752 |
9 |
-1.014105 |
0.392409 |
-1.832748 |
0.612679 |
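追加 .append()（注意：DataFrame.append()自pandas 1.4起已弃用，并在pandas 2.0中被移除；在新版本中请改用 pd.concat([df, s.to_frame().T], ignore_index=True) 得到相同的结果）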
df = pd.DataFrame(np.random.randn(4, 4), columns=['A','B','C','D'])
df
|
A |
B |
C |
D |
0 |
1.295901 |
-0.742636 |
0.873728 |
-0.810075 |
1 |
1.073456 |
0.344627 |
0.156597 |
1.460616 |
2 |
1.696282 |
-1.272457 |
1.226460 |
-1.944458 |
3 |
-0.473047 |
0.147528 |
-0.538231 |
0.125467 |
s = df.iloc[2]
s
A 1.696282
B -1.272457
C 1.226460
D -1.944458
Name: 2, dtype: float64
df.append(s, ignore_index=True)
|
A |
B |
C |
D |
0 |
1.295901 |
-0.742636 |
0.873728 |
-0.810075 |
1 |
1.073456 |
0.344627 |
0.156597 |
1.460616 |
2 |
1.696282 |
-1.272457 |
1.226460 |
-1.944458 |
3 |
-0.473047 |
0.147528 |
-0.538231 |
0.125467 |
4 |
1.696282 |
-1.272457 |
1.226460 |
-1.944458 |
分组
groupby()：一般指以下一个或多个操作步骤
- Splitting 将数据分组
- Applying 对每个分组独立地应用函数（function）
- Combining 将各分组的结果合并到某种数据结构中
df = pd.DataFrame({
'A' : ['foo', 'bar', 'foo', 'bar','foo', 'bar', 'foo', 'foo'],
'B' : ['one', 'one', 'two', 'three','two', 'two', 'one', 'three'],
'C' : np.random.randn(8),
'D' : np.random.randn(8)
})
df
|
A |
B |
C |
D |
0 |
foo |
one |
0.556699 |
1.543716 |
1 |
bar |
one |
-0.905349 |
-0.054870 |
2 |
foo |
two |
1.220397 |
-0.589706 |
3 |
bar |
three |
0.637305 |
-0.046351 |
4 |
foo |
two |
-0.150553 |
-0.889157 |
5 |
bar |
two |
-0.771132 |
0.196547 |
6 |
foo |
one |
0.008275 |
-0.571672 |
7 |
foo |
three |
0.228275 |
-1.164593 |
a = df.groupby('A').sum()
a
|
C |
D |
A |
|
|
bar |
-1.039176 |
0.095325 |
foo |
1.863094 |
-1.671411 |
a = df.groupby('A',as_index=False).sum()
a
|
A |
C |
D |
0 |
bar |
-1.039176 |
0.095325 |
1 |
foo |
1.863094 |
-1.671411 |
b = df.groupby(['A','B']).sum()
b
|
|
C |
D |
A |
B |
|
|
bar |
one |
-0.905349 |
-0.054870 |
three |
0.637305 |
-0.046351 |
two |
-0.771132 |
0.196547 |
foo |
one |
0.564975 |
0.972044 |
three |
0.228275 |
-1.164593 |
two |
1.069844 |
-1.478862 |
b = df.groupby(['A','B'],as_index=False).sum()
b
|
A |
B |
C |
D |
0 |
bar |
one |
-0.905349 |
-0.054870 |
1 |
bar |
three |
0.637305 |
-0.046351 |
2 |
bar |
two |
-0.771132 |
0.196547 |
3 |
foo |
one |
0.564975 |
0.972044 |
4 |
foo |
three |
0.228275 |
-1.164593 |
5 |
foo |
two |
1.069844 |
-1.478862 |