import numpy as np
import pandas as pd
read_table(filepath_or_buffer[, sep, …])
Read general delimited file into DataFrame.
read_csv(filepath_or_buffer[, sep, …])
Read a comma-separated values (csv) file into DataFrame.
read_fwf(filepath_or_buffer[, colspecs, …])
Read a table of fixed-width formatted lines into DataFrame.
# 要注意,read_csv只能读取csv格式的数据,而且路径需要用的是/,不能直接在文件夹中复制路径然后黏贴上去 df = pd.read_csv('./data/titanic.csv')
# 读取前5条数据 df.head()
# 读取最后的5条数据 df.tail()
# 打印当前读取数据的部分信息,包括数据样本规模,每列特征类型与个数,整体的内存占用等 # 其中的行表述数据样本,列表示每一个特征指标,读回来的数据基本上返回的都是df结构 df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 891 entries, 0 to 890 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 PassengerId 891 non-null int64 1 Survived 891 non-null int64 2 Pclass 891 non-null int64 3 Name 891 non-null object 4 Sex 891 non-null object 5 Age 714 non-null float64 6 SibSp 891 non-null int64 7 Parch 891 non-null int64 8 Ticket 891 non-null object 9 Fare 891 non-null float64 10 Cabin 204 non-null object 11 Embarked 889 non-null object dtypes: float64(2), int64(5), object(5) memory usage: 83.7+ KB
# 返回索引 df.index
RangeIndex(start=0, stop=891, step=1)
# 拿到每一个特征的名字 df.columns
Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'], dtype='object')
# 每一列的类型,其中object表示python中的字符串,而numpy中的object也是表示字符串 df.dtypes
PassengerId int64 Survived int64 Pclass int64 Name object Sex object Age float64 SibSp int64 Parch int64 Ticket object Fare float64 Cabin object Embarked object dtype: object
# 直接取得数值矩阵 df.values
array([[1, 0, 3, ..., 7.25, nan, 'S'], [2, 1, 1, ..., 71.2833, 'C85', 'C'], [3, 1, 3, ..., 7.925, nan, 'S'], ..., [889, 0, 3, ..., 23.45, nan, 'S'], [890, 1, 1, ..., 30.0, 'C148', 'C'], [891, 0, 3, ..., 7.75, nan, 'Q']], dtype=object)
# 数据索引 age = df['Age'] # 获得单个数据 print("single:\n",age[0]) # 获得前x个数据 print("more:\n",age[:5]) # 单独把结果拿出来 age.values[:5]
single: 22.0 more: 0 22.0 1 38.0 2 26.0 3 35.0 4 35.0 Name: Age, dtype: float64 array([22., 38., 26., 35., 35.])
# 默认数字为索引 df.head()
# 将名字设置为索引,reset_index()是还原索引 df = df.set_index('Name') df.head()
# 此时便可以通过名字来查找年龄 age = df['Age'] print("Braund, Mr. Owen Harris age:",age['Braund, Mr. Owen Harris']) print("Futrelle, Mrs. Jacques Heath (Lily May Peel) age:",age['Futrelle, Mrs. Jacques Heath (Lily May Peel)'])
Braund, Mr. Owen Harris age: 22.0 Futrelle, Mrs. Jacques Heath (Lily May Peel) age: 35.0
# 还原索引 df = df.reset_index() # 设置多个数据 Clichong = df[['Name','Age','Fare']] Clichong[:5]
# iloc例子 # 可以拿到一个数据 print("df.iloc[2]",df.iloc[2]) # 可以拿到一片数据 print("\n df.iloc[1:4]",df.iloc[1:4]) # 不仅可以指定样本,还可以指定特征 print("\n df.iloc[1:4,1:3]",df.iloc[1:4,1:3])
df.iloc[2] index 2 Name Heikkinen, Miss. Laina PassengerId 3 Survived 1 Pclass 3 Sex female Age 26 SibSp 0 Parch 0 Ticket STON/O2. 3101282 Fare 7.925 Cabin NaN Embarked S Name: 2, dtype: object df.iloc[1:4] index Name PassengerId \ 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... 2 2 2 Heikkinen, Miss. Laina 3 3 3 Futrelle, Mrs. Jacques Heath (Lily May Peel) 4 Survived Pclass Sex Age SibSp Parch Ticket Fare \ 1 1 1 female 38.0 1 0 PC 17599 71.2833 2 1 3 female 26.0 0 0 STON/O2. 3101282 7.9250 3 1 1 female 35.0 1 0 113803 53.1000 Cabin Embarked 1 C85 C 2 NaN S 3 C123 S df.iloc[1:4,1:3] Name PassengerId 1 Cumings, Mrs. John Bradley (Florence Briggs Th... 2 2 Heikkinen, Miss. Laina 3 3 Futrelle, Mrs. Jacques Heath (Lily May Peel) 4
# loc例子 # 取出Heikkinen, Miss. Laina的全部信息 df.loc[2]
Name Heikkinen, Miss. Laina index 2 PassengerId 3 Survived 1 Pclass 3 Sex female Age 26 SibSp 0 Parch 0 Ticket STON/O2. 3101282 Fare 7.925 Cabin NaN Embarked S Name: 2, dtype: object
# 取出Heikkinen, Miss. Laina的性别信息 df.loc[2,'Sex']
# 截取部分人的全部信息 df.loc[0:2]
# 截取部分人的部分信息 df.loc[0:2,'Name']
0 Braund, Mr. Owen Harris 1 Cumings, Mrs. John Bradley (Florence Briggs Th... 2 Heikkinen, Miss. Laina Name: Name, dtype: object
# 设置索引,找出全部男性 df = df.set_index('Name') #重复设置会报错 df['Sex'] == 'male'
Name Braund, Mr. Owen Harris True Cumings, Mrs. John Bradley (Florence Briggs Thayer) False Heikkinen, Miss. Laina False Futrelle, Mrs. Jacques Heath (Lily May Peel) False Allen, Mr. William Henry True ... Montvila, Rev. Juozas True Graham, Miss. Margaret Edith False Johnston, Miss. Catherine Helen "Carrie" False Behr, Mr. Karl Howell True Dooley, Mr. Patrick True Name: Sex, Length: 891, dtype: bool
# 找出前五个的男性 df[df['Sex'] == 'male'][0:5]
# 计算所有男性乘客的平均年龄 df.loc[df['Sex'] == 'male','Age'].mean()
0 22.0 4 35.0 5 NaN 6 54.0 7 2.0 ... 883 28.0 884 25.0 886 27.0 889 26.0 890 32.0 Name: Age, Length: 577, dtype: float64
df['Sex'] == 'male' :筛选出男性
df.loc[df['Sex'] == 'male','Age'] :列举出全部男性的年龄
df.loc[df['Sex'] == 'male','Age'].mean() : 求出平均年龄
# 大于70岁的乘客的信息 df[df['Age'] >= 70]
# 计算大于70岁的乘客的人数 (df['Age'] >= 70).sum()
# 计算大于70岁的乘客的总岁数 df.loc[df['Age'] >= 70,'Age'].sum()
data = {'country':['China','America','India'],'population':[14,5,25]} df = pd.DataFrame(data) df
df[df['population'] >= 14]
参数设置 set_option
# 设置最大显示行数与列数 # pd.set_option('display.max_columns',20) pd.set_option('display.max_rows',6) df
891 rows × 12 columns
# 查看最大显示的列数与行数 print("pd.get_option('display.max_rows'):",pd.get_option('display.max_rows')) print("pd.get_option('display.max_columns'):",pd.get_option('display.max_columns'))
pd.get_option('display.max_rows'): 6 pd.get_option('display.max_columns'): 20
# 创建Series结构类型 data = [1,2,3] index = ['a','b','c'] Clichong = pd.Series(data = data,index = index) Clichong
a 1 b 2 c 3 dtype: int64
Clichong_cp = Clichong.copy() Clichong_cp['b']
# inplace如果为False则表示不将结果赋值给变量,只相当于打印操作;如果为True,就直接表示在数据中执行实际变换 Clichong.replace(to_replace=1,value=100,inplace=True) Clichong
a 100 b 2 c 3 dtype: int64
Index(['a', 'b', 'c'], dtype='object')
# 改变索引 Clichong.rename(index={'a':'A'},inplace=True) Clichong.index
Index(['A', 'b', 'c'], dtype='object')
# 增加数据的方式 # 方法1:直接赋值 Clichong['d'] = 4 Clichong
A 100 b 2 c 3 d 4 dtype: int64
# Method 2: append another Series. ignore_index=True discards the labels of
# both inputs and renumbers the result 0..n-1.
# Series.append was removed in pandas 2.0, so pd.concat is used instead.
data = [5, 6]
index = ['e', 'f']
Clichong2 = pd.Series(data=data, index=index)
Clichong = pd.concat([Clichong, Clichong2], ignore_index=True)
Clichong
0 100 1 2 2 3 ... 5 6 6 5 7 6 Length: 8, dtype: int64
# 方法1:del删除 del Clichong[1] Clichong
2 3 3 4 4 5 5 6 6 5 7 6 dtype: int64
# 方法2:drop删除 Clichong.drop([7],inplace=True) Clichong
4 5 5 6 6 5 dtype: int64
# 基本的一些统计方法都是类似的,max,min,median,sum等等 df['Age'].mean()
# 观察索引样本的函数 pd.set_option('display.max_rows',8) # 设置最大显示行数为8 df.describe()
# Covariance matrix of the numeric columns.
# numeric_only=True is passed explicitly: since pandas 2.0 the default is
# False, so a frame that also holds string columns (e.g. 'Sex') would raise.
df.cov(numeric_only=True)

# Pairwise correlation coefficients of the numeric columns.
df.corr(numeric_only=True)
# 统计某一I列属性的比例情况 df['Sex'].value_counts() # 默认多的排前面 # 还可以让少的排前面 df['Sex'].value_counts(ascending = True)
female 314 male 577 Name: Sex, dtype: int64
# 对年龄段划分为几个组,bins是划分的组数 df['Age'].value_counts(ascending = True,bins = 5)
(64.084, 80.0] 11 (48.168, 64.084] 69 (0.339, 16.336] 100 (32.252, 48.168] 188 (16.336, 32.252] 346 Name: Age, dtype: int64
data = [1,2,3,4,5,6,7,8,9,0] bins = [0,3,7,10] Clichong = pd.cut(data,bins) Clichong
[(0.0, 3.0], (0.0, 3.0], (0.0, 3.0], (3.0, 7.0], (3.0, 7.0], (3.0, 7.0], (3.0, 7.0], (7.0, 10.0], (7.0, 10.0], NaN] Categories (3, interval[int64]): [(0, 3] < (3, 7] < (7, 10]]
(3, 7] 4 (0, 3] 3 (7, 10] 2 dtype: int64
# Bin the ages into the custom intervals (10, 30], (30, 50], (50, 80] and
# count how many rows fall into each bin.
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
pd.cut(df['Age'], [10, 30, 50, 80]).value_counts()
(10, 30] 345 (30, 50] 241 (50, 80] 64 Name: Age, dtype: int64
# Bin the ages and attach a readable label to each bin through the `labels`
# argument of pd.cut; the resulting categorical data can then be tallied
# with value_counts().
# group_name is defined here so this cell does not depend on a later one.
group_name = ['Young', 'Mille', 'Old']
pd.cut(df['Age'], [10, 30, 50, 80], labels=group_name)
0 Young 1 Mille 2 Young 3 Mille ... 887 Young 888 NaN 889 Young 890 Mille Name: Age, Length: 891, dtype: category Categories (3, object): [Young < Mille < Old]
# Custom bin labels via the `labels` argument of pd.cut, followed by a count
# of the rows per label.
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
group_name = ['Young', 'Mille', 'Old']
pd.cut(df['Age'], [10, 30, 50, 80], labels=group_name).value_counts()
Young 345 Mille 241 Old 64 Name: Age, dtype: int64
example = pd.DataFrame({'Month': ["January", "January", "January", "January", "February", "February", "February", "February", "March", "March", "March", "March"], 'Category': ["Transportation", "Grocery", "Household", "Entertainment", "Transportation", "Grocery", "Household", "Entertainment", "Transportation", "Grocery", "Household", "Entertainment"], 'Amount': [74., 235., 175., 100., 115., 240., 225., 125., 90., 260., 200., 120.]}) example
12 rows × 3 columns
# pivot是数据透析表,可以按照自己的方式来分析数据 example_pivot = example.pivot(index = 'Category',columns= 'Month',values = 'Amount') example_pivot
# 统计行项总和 example_pivot.sum(axis=1)
Category Entertainment 345.0 Grocery 735.0 Household 600.0 Transportation 279.0 dtype: float64
# 统计列项总和 example_pivot.sum(axis=0)
Month February 705.0 January 584.0 March 670.0 dtype: float64
# 指定最大值而不是平均值 df.pivot_table(index='Sex',columns='Pclass',values='Fare',aggfunc='max') # or min
# 想统计各船舱等级的人数 df.pivot_table(index='Sex',columns='Pclass',values='Fare',aggfunc='count')
# 进一步统计人数,不分男女 (df.pivot_table(index='Sex',columns='Pclass',values='Fare',aggfunc='count')).sum(axis=0)
Pclass 1 216 2 184 3 491 dtype: int64
# 将乘客分成两组:成年人与未成年人,再对这两组乘客分别统计不同性别人的平均获救可能性,mean为平均值,虽然默认也是平均值 df['Underaged'] = df['Age'] <= 18 df.pivot_table(index = 'Underaged',columns='Sex',values='Survived',aggfunc='mean')
df = pd.DataFrame({'key':['A','B','C','A','B','C','A','B','C'], 'data':[0,5,10,5,10,15,10,15,20]}) df
9 rows × 2 columns
# 统计key的取值的常见代码 for key in ['A','B','C']: print(key,df[df['key'] == key].sum())
A key AAA data 15 dtype: object B key BBB data 30 dtype: object C key CCC data 45 dtype: object
# groupby可以代替以上代码,默认为累加值 df.groupby('key').sum()
# The aggregation can be swapped for the mean (or another statistic).
# The string name 'mean' is used instead of np.mean: passing NumPy callables
# to aggregate() is deprecated in recent pandas, and this form needs no
# numpy import.
df.groupby('key').aggregate('mean')
# 按照不同性别统计年龄的平均值 df.groupby('Sex')['Age'].mean()
Sex female 27.915709 male 30.726645 Name: Age, dtype: float64
# 按照性别统计存活率 df.groupby('Sex')['Survived'].mean()
Sex female 0.742038 male 0.188908 Name: Survived, dtype: float64
# 数据集 df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], 'B' : ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'], 'C' : np.random.randn(8), 'D' : np.random.randn(8)}) df
# 调用count属性 print("mean:\n",df.groupby('A').mean()) print("\ncount1:\n",df.groupby('A').count()) print("\ncount2:\n",df.groupby(['A','B']).count())
mean: C D A bar -0.241736 0.790723 foo -0.222434 -0.466558 count1: B C D A bar 3 3 3 foo 5 5 5 count2: C D A B bar one 1 1 three 1 1 two 1 1 foo one 2 2 three 1 1 two 2 2
# Choose the aggregation explicitly for the (A, B) groups.
# Aggregations are named by string ('sum', 'mean', 'std'); passing the NumPy
# callables np.sum / np.mean / np.std is deprecated in recent pandas.
grouped = df.groupby(['A', 'B'])
grouped.aggregate('sum')

# as_index=False returns the group keys A and B as ordinary columns instead
# of using them as the index of the result.
grouped = df.groupby(['A', 'B'], as_index=False)
grouped.aggregate('sum')

# describe() shows the summary statistics of every group at once.
grouped.describe().head()

# Request a custom set of statistics for a single column.
grouped = df.groupby('A')
grouped['C'].agg(['sum', 'mean', 'std'])
left = pd.DataFrame({'key':['K0','K1','K2','K3'], 'A':['A0','A1','A2','A3'], 'B':['B0','B1','B2','B3']}) right = pd.DataFrame({'key':['K0','K1','K2','K3'], 'C':['C0','C1','C2','C3'], 'D':['D0','D1','D2','D3']})
left = pd.DataFrame({'key1':['K0','K1','K2','K3'], 'key2':['K0','K1','K2','K3'], 'A':['A0','A1','A2','A3'], 'B':['B0','B1','B2','B3']}) right = pd.DataFrame({'key1':['K0','K1','K2','K3'], 'key2':['K0','K1','K2','K4'], 'C':['C0','C1','C2','C3'], 'D':['D0','D1','D2','D3']})
# 发现有部分数据被丢弃 pd.merge(left,right,on=['key1','key2'])
# 考虑所有的结果how pd.merge(left,right,on=['key1','key2'],how='outer')
# 再加入详细的说明indicator pd.merge(left,right,on=['key1','key2'],how='outer',indicator = True)
# 或者只考虑一边,比如只考虑左边 pd.merge(left,right,on=['key1','key2'],how='left')
# 或者只考虑一边,比如只考虑右边 pd.merge(left,right,on=['key1','key2'],how='right')
data = pd.DataFrame({'group':['a','a','a','b','b','b','c','c','c'], 'data':[4,3,2,1,12,3,4,5,7]}) data
9 rows × 2 columns
data.sort_values(by=['group','data'],ascending = [False,True],inplace=True) data
9 rows × 2 columns
data = pd.DataFrame({'k1':['one']*3+['two']*4, 'k2':[3,2,1,3,3,4,4]}) data
# 排序操作:降序 data.sort_values(by='k2',ascending = False)
# 去除相同的数据 data.drop_duplicates()
# 只考虑某一列的重复情况,其他全部舍弃 data.drop_duplicates(subset='k1')
# 往数据里添加新列,使用assign函数 df = pd.DataFrame({'data1':np.random.randn(5),'data2':np.random.randn(5)}) df2 = df.assign(ration = df['data1']/df['data2']) df2
# 通过isnull判断缺失值 df = pd.DataFrame([range(3),[0, np.nan,0],[0,0,np.nan],range(3)]) df.isnull()
# 如果数据太多,可以按列或者是行来判断是否有缺失值 # 按列来判断 df.isnull().any()
0 False 1 True 2 True dtype: bool
# 按行来判断 df.isnull().any(axis = 1)
0 False 1 True 2 True 3 False dtype: bool
# 填充缺失值 df.fillna(100)
data = pd.DataFrame({
    'food': ['A1', 'A2', 'B1', 'B2', 'B3', 'C1', 'C2'],
    'data': [1, 2, 3, 4, 5, 6, 7],
})
data

# Food code -> food group, consulted by food_map().
_FOOD_GROUPS = {
    'A1': 'A', 'A2': 'A',
    'B1': 'B', 'B2': 'B', 'B3': 'B',
    'C1': 'C', 'C2': 'C',
}


def food_map(series):
    """Return the group letter for the 'food' entry of one row.

    Args:
        series: A row (anything indexable by the key 'food').

    Returns:
        'A', 'B' or 'C' for the known food codes, or None for any other code.
    """
    return _FOOD_GROUPS.get(series['food'])


# Mapping with apply: food_map is called once per row (axis='columns').
data['food_map'] = data.apply(food_map, axis='columns')
data

# Mapping with Series.map and a lookup dict.
# NOTE(review): despite the names `food2Upper` / 'upper', the mapped values
# are lower-case -- confirm which of the two was intended.
food2Upper = {
    'A1': 'a1',
    'A2': 'a2',
    'B1': 'b1',
    'B2': 'b2',
    'B3': 'b3',
    'C1': 'c1',
    'C2': 'c2',
}
data['upper'] = data['food'].map(food2Upper)
data
# 创建一个时间戳 ts = pd.Timestamp('2021-1-21') ts
Timestamp('2021-01-21 00:00:00')
# 打印月份 print("ts.month:",ts.month) print("ts.month_name",ts.month_name())
ts.month: 1 ts.month_name January
# 打印天数 print("ts.day:",ts.day) print("ts.day_name",ts.day_name())
ts.day: 21 ts.day_name Thursday
s = pd.Series(['2017-11-24 00:00:00','2017-11-25 00:00:00','2017-11-26 00:00:00']) s
0 2017-11-24 00:00:00 1 2017-11-25 00:00:00 2 2017-11-26 00:00:00 dtype: object
# 转换成标准格式 ts = pd.to_datetime(s) ts
0 2017-11-24 1 2017-11-25 2 2017-11-26 dtype: datetime64[ns]
ts.dt.hour
0 0 1 0 2 0 dtype: int64
ts.dt.weekday
0 4 1 5 2 6 dtype: int64
# Build a custom time feature: ten timestamps spaced twelve hours apart,
# starting at 2017-11-24.
# The lower-case 'h' alias is used because the upper-case 'H' alias is
# deprecated since pandas 2.2.
pd.Series(pd.date_range(start='2017-11-24', periods=10, freq='12h'))
0 2017-11-24 00:00:00 1 2017-11-24 12:00:00 2 2017-11-25 00:00:00 3 2017-11-25 12:00:00 ... 6 2017-11-27 00:00:00 7 2017-11-27 12:00:00 8 2017-11-28 00:00:00 9 2017-11-28 12:00:00 Length: 10, dtype: datetime64[ns]
# 如果以时间特征为索引,可以将parse_dates参数设置为True df = pd.read_csv('./data/flowdata.csv',index_col=0,parse_dates=True) df
11697 rows × 3 columns
# 有了索引就可以取数据 df[pd.Timestamp('2012-01-01 09:00'):pd.Timestamp('2012-01-11 19:00')]
# Select every row from the year 2012 by partial-string indexing.
# Assumes df carries the datetime index built when the flow data was loaded.
# Row selection through df['2012'] was removed in pandas 2.0; .loc is the
# supported spelling.
df.loc['2012']
2928 rows × 3 columns
# 指定具体月份 df['2013-01':'2013-01']
9 rows × 3 columns
# 进行细致的判断 df[(df.index.hour > 8) & (df.index.hour < 12) & (df.index.day == 1) & (df.index.month == 1)]
# mean表示取平均值,head表示取前五条数据,resample重采样统计了每天的平均指标(默认是1条为1个周期) df.resample('D').mean().head()
# 设置为10天一个周期 df.resample('10D').mean().head()
# 按月来进行统计 df.resample('M').mean().head()
%matplotlib inline df = pd.DataFrame(np.random.randn(10, 4).cumsum(0), index = np.arange(0, 100, 10), columns = ['A', 'B', 'C', 'D']) df.plot() df
10 rows × 4 columns
# 可以指定绘图的种类,比如散点图,条形图等等 df = pd.DataFrame(np.random.rand(6, 4), index = ['one', 'two', 'three', 'four', 'five', 'six'], columns = pd.Index(['A', 'B', 'C', 'D'], name = 'Genus')) df.plot(kind='bar') df
tips = pd.read_csv('./tips.csv') tips.total_bill.plot(kind='hist',bins=50) tips
244 rows × 7 columns
df = pd.read_csv('./macrodata.csv') data = df[['quarter','realgdp','realcons']] data.plot.scatter('quarter','realgdp') data
203 rows × 3 columns
# 由于样本的较大,所以加载进来都卡顿了一会 gl = pd.read_csv('./game_logs.csv') gl.head()
5 rows × 161 columns
# 样本数据 gl.shape
(171907, 161)
# 指定成deep表示要详细展示当前数据占用的内存(执行的时候卡顿了好久) gl.info(memory_usage='deep')
<class 'pandas.core.frame.DataFrame'> RangeIndex: 171907 entries, 0 to 171906 Columns: 161 entries, date to acquisition_info dtypes: float64(77), int64(6), object(78) memory usage: 860.5 MB
for dtype in ['float64','object','int64']: selected_dtype = gl.select_dtypes(include=[dtype]) mean_usage_b = selected_dtype.memory_usage(deep=True).mean() mean_usage_mb = mean_usage_b / 1024 ** 2 print("Average memory usage for {} columns: {:03.2f} MB".format(dtype,mean_usage_mb))
Average memory usage for float64 columns: 1.29 MB Average memory usage for object columns: 9.51 MB Average memory usage for int64 columns: 1.12 MB
int_types = ["uint8", "int8", "int16","int32","int64"] for it in int_types: print(np.iinfo(it))
Machine parameters for uint8 --------------------------------------------------------------- min = 0 max = 255 --------------------------------------------------------------- Machine parameters for int8 --------------------------------------------------------------- min = -128 max = 127 --------------------------------------------------------------- Machine parameters for int16 --------------------------------------------------------------- min = -32768 max = 32767 --------------------------------------------------------------- Machine parameters for int32 --------------------------------------------------------------- min = -2147483648 max = 2147483647 --------------------------------------------------------------- Machine parameters for int64 --------------------------------------------------------------- min = -9223372036854775808 max = 9223372036854775807 ---------------------------------------------------------------
def mem_usage(pandas_obj):
    """Return the deep memory footprint of a pandas object as a string in MB.

    Args:
        pandas_obj: A DataFrame (the per-column usage is summed) or any other
            object, which is then assumed to be a Series.

    Returns:
        The usage in megabytes (bytes / 1024**2) with two decimals, followed
        by " MB".
    """
    if isinstance(pandas_obj, pd.DataFrame):
        usage_b = pandas_obj.memory_usage(deep=True).sum()
    else:
        # Anything that is not a DataFrame is assumed to be a Series.
        usage_b = pandas_obj.memory_usage(deep=True)
    usage_mb = usage_b / 1024 ** 2  # convert bytes to megabytes
    return "{:03.2f} MB".format(usage_mb)


# Downcast every int64 column to the smallest unsigned integer dtype that can
# hold its values (see pandas.to_numeric, `downcast` argument), then compare
# the memory use before and after.
gl_int = gl.select_dtypes(include=['int64'])
converted_int = gl_int.apply(pd.to_numeric, downcast='unsigned')

print(mem_usage(gl_int))
print(mem_usage(converted_int))
7.87 MB 1.48 MB
7.87 MB是全部为int64类型时的内存占用量
1.48 MB是向下转换后,int类型数据的内存占用量
gl_float = gl.select_dtypes(include=['float64']) converted_float = gl_float.apply(pd.to_numeric,downcast='float') print(mem_usage(gl_float)) print(mem_usage(converted_float))
100.99 MB 50.49 MB
# Convert the object columns with few distinct values to the 'category'
# dtype and compare the memory use before and after.
gl_obj = gl.select_dtypes(include=['object']).copy()
converted_obj = pd.DataFrame()

for col in gl_obj.columns:
    num_unique_values = len(gl_obj[col].unique())
    num_total_values = len(gl_obj[col])
    # Convert only when fewer than half of the values are distinct; the
    # truthiness check skips the ratio for a zero-row frame, which would
    # otherwise divide by zero.
    if num_total_values and num_unique_values / num_total_values < 0.5:
        converted_obj[col] = gl_obj[col].astype('category')
    else:
        converted_obj[col] = gl_obj[col]

print(mem_usage(gl_obj))
print(mem_usage(converted_obj))
751.64 MB 51.67 MB