import numpy as np
import pandas as pd
创建DataFrame
In [2]:
df = pd.DataFrame({'col_a': np.arange(10),
'col_b': np.random.randn(10),
'col_c': np.random.choice(['A', 'B', 'C'], 10),
'col_d': np.random.choice([0, 1], 10)})
df.head(5)
# R code:
# df <- data.frame(col_a = 0:9,
# col_b = rnorm(10),
# col_c = sample(c('A', 'B', 'C'), size = 10, replace = TRUE),
# col_d = sample(c(0, 1), size = 10, replace = TRUE),
# stringsAsFactors = FALSE)
# head(df, 5)
Out[2]:
col_a col_b col_c col_d
0 0 0.308520 C 1
1 1 -1.829450 B 1
2 2 -0.710135 C 0
3 3 1.354760 A 0
4 4 -0.581359 A 1
获取DataFrame维度
In [3]:
print(df.shape, df.shape[0], df.shape[1])
# R code:
# dim(df), rnow(df), ncol(df)
(10, 4) 10 4
获取DataFrame列名
In [4]:
df.columns
# R code:
# names(df)
Out[4]:
Index(['col_a', 'col_b', 'col_c', 'col_d'], dtype='object')
数据选取
In [5]:
# 选取前5行数据
df.iloc[:5]
# R code:
# df[1:5, ]
Out[5]:
col_a col_b col_c col_d
0 0 0.308520 C 1
1 1 -1.829450 B 1
2 2 -0.710135 C 0
3 3 1.354760 A 0
4 4 -0.581359 A 1
# 选取col_a和col_b列
df[['col_a', 'col_b']]
# R code:
# df[, c('col_a', 'col_b')]
Out[6]:
col_a col_b
0 0 0.308520
1 1 -1.829450
2 2 -0.710135
3 3 1.354760
4 4 -0.581359
5 5 1.633542
6 6 -0.253950
7 7 1.799087
8 8 0.412991
9 9 0.374330
# 选取前5行和前2列
df.iloc[:5, :2]
# R code:
# df[1:5, 1:2]
Out[7]:
col_a col_b
0 0 0.308520
1 1 -1.829450
2 2 -0.710135
3 3 1.354760
4 4 -0.581359
# 选取单个值(scalar)
df.iat[0, 1]
# R code:
# df[1, 2]
Out[8]:
0.3085196186883713
按条件选取数据
In [9]:
df[(df['col_a'] > 3) & (df['col_b'] < 0)]
# or
# df.query('col_a > 3 & col_b < 0')
# R code:
# df[df$col_a > 3 & df$col_b < 0, ]
Out[9]:
col_a col_b col_c col_d
4 4 -0.581359 A 1
6 6 -0.253950 B 1
In [10]:
df[df['col_c'].isin(['A', 'B'])]
# R code:
# df[df$col_c %in% c('A', 'B'), ]
Out[10]:
col_a col_b col_c col_d
1 1 -1.829450 B 1
3 3 1.354760 A 0
4 4 -0.581359 A 1
5 5 1.633542 B 1
6 6 -0.253950 B 1
7 7 1.799087 A 1
9 9 0.374330 A 0
增加新列
In [11]:
df['col_e'] = df['col_a'] + df['col_b']
df
# df$col_e <- df$col_a + df$col_b
Out[11]:
col_a col_b col_c col_d col_e
0 0 0.308520 C 1 0.308520
1 1 -1.829450 B 1 -0.829450
2 2 -0.710135 C 0 1.289865
3 3 1.354760 A 0 4.354760
4 4 -0.581359 A 1 3.418641
5 5 1.633542 B 1 6.633542
6 6 -0.253950 B 1 5.746050
7 7 1.799087 A 1 8.799087
8 8 0.412991 C 0 8.412991
9 9 0.374330 A 0 9.374330
删除列
In [12]:
# 删除col_e列
df = df.drop(columns='col_e')
df
# R code:
# df <- df[, !names(df) == 'col_e']
Out[12]:
col_a col_b col_c col_d
0 0 0.308520 C 1
1 1 -1.829450 B 1
2 2 -0.710135 C 0
3 3 1.354760 A 0
4 4 -0.581359 A 1
5 5 1.633542 B 1
6 6 -0.253950 B 1
7 7 1.799087 A 1
8 8 0.412991 C 0
9 9 0.374330 A 0
In [13]:
# 删除第一列
df.drop(columns=df.columns[0])
# R code:
# df[, -1]
Out[13]:
col_b col_c col_d
0 0.308520 C 1
1 -1.829450 B 1
2 -0.710135 C 0
3 1.354760 A 0
4 -0.581359 A 1
5 1.633542 B 1
6 -0.253950 B 1
7 1.799087 A 1
8 0.412991 C 0
9 0.374330 A 0
转置
In [14]:
df.T
# R code:
# t(df)
Out[14]:
0 1 2 3 4 5 6 7 8 9
col_a 0 1 2 3 4 5 6 7 8 9
col_b 0.30852 -1.82945 -0.710135 1.35476 -0.581359 1.63354 -0.25395 1.79909 0.412991 0.37433
col_c C B C A A B B A C A
col_d 1 1 0 0 1 1 1 1 0 0
数据类型转换
In [15]:
df['col_a'].astype(str)
# as.character(df$col_a)
Out[15]:
0 0
1 1
2 2
3 3
4 4
5 5
6 6
7 7
8 8
9 9
Name: col_a, dtype: object
转换为类别(categories)/因子(factor)类型
In [16]:
pd.Categorical(df['col_c'])
# factor(df$col_d)
Out[16]:
[C, B, C, A, A, B, B, A, C, A]
Categories (3, object): [A, B, C]
数据汇总
按行进行计算
In [17]:
df[['col_a', 'col_b']].sum(axis=1)
# R code:
# apply(df[, c('col_a', 'col_b')], 1, sum)
Out[17]:
0 0.308520
1 -0.829450
2 1.289865
3 4.354760
4 3.418641
5 6.633542
6 5.746050
7 8.799087
8 8.412991
9 9.374330
dtype: float64
按列进行计算
In [18]:
df[['col_a', 'col_b']].mean(axis=0)
# R code:
# apply(df[, c('col_a', 'col_b')], 2, mean)
Out[18]:
col_a 4.500000
col_b 0.250834
dtype: float64
In [19]:
df[['col_a', 'col_b']].apply(lambda x: x.mean() + 10)
# R code:
# apply(df[, c('col_a', 'col_b')], 2, function(x) mean(x) + 10)
Out[19]:
col_a 14.500000
col_b 10.250834
dtype: float64
数据合并
合并列
In [20]:
df2 = pd.DataFrame({'col_x': np.arange(10),
'col_y': np.arange(10)[::-1]})
df2
Out[20]:
col_x col_y
0 0 9
1 1 8
2 2 7
3 3 6
4 4 5
5 5 4
6 6 3
7 7 2
8 8 1
9 9 0
In [21]:
pd.concat([df, df2], axis=1)
# R code:
# cbind(df, df2)
Out[21]:
col_a col_b col_c col_d col_x col_y
0 0 0.308520 C 1 0 9
1 1 -1.829450 B 1 1 8
2 2 -0.710135 C 0 2 7
3 3 1.354760 A 0 3 6
4 4 -0.581359 A 1 4 5
5 5 1.633542 B 1 5 4
6 6 -0.253950 B 1 6 3
7 7 1.799087 A 1 7 2
8 8 0.412991 C 0 8 1
9 9 0.374330 A 0 9 0
合并行
In [22]:
df3 = pd.DataFrame({'col_a': [-1, -2],
'col_b' : [0, 1],
'col_c': ['B', 'C'],
'col_d': [1, 0]})
df3
Out[22]:
col_a col_b col_c col_d
0 -1 0 B 1
1 -2 1 C 0
In [23]:
pd.concat([df, df3], axis=0, ignore_index=True)
# R code:
# rbind(df, df3)
Out[23]:
col_a col_b col_c col_d
0 0 0.308520 C 1
1 1 -1.829450 B 1
2 2 -0.710135 C 0
3 3 1.354760 A 0
4 4 -0.581359 A 1
5 5 1.633542 B 1
6 6 -0.253950 B 1
7 7 1.799087 A 1
8 8 0.412991 C 0
9 9 0.374330 A 0
10 -1 0.000000 B 1
11 -2 1.000000 C 0