import time

# Regular pandas.
import pandas as pd
# Modin's pandas-compatible module, used here as the alternative reader.
import modin.pandas as mpd


def _report_read_time(label, read_csv, path):
    """Time one ``read_csv(path)`` call and print the elapsed seconds.

    Args:
        label: Text printed in front of the elapsed time.
        read_csv: Callable that loads a CSV file given its path.
        path: Path of the CSV file to load.
    """
    # perf_counter() is monotonic and high-resolution, so the measured
    # interval is not affected by system clock adjustments.
    start = time.perf_counter()
    # Keep a reference to the result so that releasing the loaded frame
    # does not happen inside the timed region.
    _frame = read_csv(path)
    elapsed = time.perf_counter() - start
    # NOTE(review): depending on the Modin engine, read_csv may return
    # before all work has finished -- confirm before relying on the number.
    print(label, elapsed)


def test_pd_time(path):
    """Print how long ``pandas.read_csv`` takes to load *path*."""
    _report_read_time('pd consume time is:', pd.read_csv, path)


def test_mpd_time(path):
    """Print how long ``modin.pandas.read_csv`` takes to load *path*."""
    _report_read_time('modin pd consume time is:', mpd.read_csv, path)


path1 = '/home/yjj/data_oanda/AUD_CAD.csv'
path2 = '/opt/oanda_pair_rate.csv'

# Benchmark a large sample.
print('大样本测试')
test_pd_time(path1)
test_mpd_time(path1)

# Benchmark a small sample.
print('小样本测试')
test_pd_time(path2)
test_mpd_time(path2)
大样本测试(2.5G左右)
pd consume time is: 36.11769914627075
modin pd consume time is: 8.59299921989441
小样本测试(100M左右)
pd consume time is: 0.00580286979675293
modin pd consume time is: 0.028467655181884766
注:处理 1 GB 以上的大文件时,建议使用 modin.pandas;处理小文件时,建议使用 pandas(如上所示,小文件上 modin 反而更慢)。