I am using the pandas insert method (https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#io-sql-method) to take CSVs converted from an .mdb file and insert them into Postgres. One of the tables I am trying to convert has 1.2 million rows, which is more than my EC2 instance can handle. Is there a way to modify or break up this function to handle a CSV that large? Other tables of 150K–300K rows work fine. Thanks.
```
import csv
from io import StringIO

from sqlalchemy import create_engine


def psql_insert_copy(table, conn, keys, data_iter):
    # Get a DBAPI connection that can provide a cursor
    dbapi_conn = conn.connection
    with dbapi_conn.cursor() as cur:
        # Buffer this batch of rows as CSV in memory
        s_buf = StringIO()
        writer = csv.writer(s_buf)
        writer.writerows(data_iter)
        s_buf.seek(0)

        columns = ', '.join('"{}"'.format(k) for k in keys)
        if table.schema:
            table_name = '{}.{}'.format(table.schema, table.name)
        else:
            table_name = table.name

        # Bulk-load the buffered rows with Postgres COPY
        sql = 'COPY {} ({}) FROM STDIN WITH CSV'.format(table_name, columns)
        cur.copy_expert(sql=sql, file=s_buf)


def df_to_postgres(table_name, dataframe):
    df = dataframe
    engine = create_engine(f'postgresql://{PGusername}:{PGpassword}@{PGendpoint}:5432/testDB')
    df.to_sql(table_name.lower(), engine, if_exists='replace', method=psql_insert_copy)
```
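One way to "break up" the work is to write the DataFrame in slices instead of a single to_sql call, so that only one slice at a time is converted and copied. This is only a rough sketch, not the original code: df_to_postgres_chunked and batch_size are illustrative names, and it reuses the psql_insert_copy function and the PGusername/PGpassword/PGendpoint variables defined above.

```
def df_to_postgres_chunked(table_name, dataframe, batch_size=50_000):
    # Same connection string as df_to_postgres above.
    engine = create_engine(f'postgresql://{PGusername}:{PGpassword}@{PGendpoint}:5432/testDB')
    for start in range(0, len(dataframe), batch_size):
        chunk = dataframe.iloc[start:start + batch_size]
        # The first slice replaces any existing table; later slices append to it,
        # mirroring the drop-and-recreate behaviour of the original if_exists='replace'.
        chunk.to_sql(table_name.lower(), engine,
                     if_exists='replace' if start == 0 else 'append',
                     method=psql_insert_copy)
```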
Edit: I added a chunksize parameter, but I still get a memory error. The traceback is below.

```
File "20191217_Access2PG.py", line 122, in <module>
filter_mdb(args.bucket,args.profile)
File "20191217_Access2PG.py", line 119, in filter_mdb
s3_download(bucket, mdbList)
File "20191217_Access2PG.py", line 103, in s3_download
get_tables(path)
File "20191217_Access2PG.py", line 84, in get_tables
table_data(table_names)
File "20191217_Access2PG.py", line 69, in table_data
df_to_postgres(item, csv_file)
File "20191217_Access2PG.py", line 48, in df_to_postgres
df.to_sql(table_name.lower(), engine, if_exists='replace', method=psql_insert_copy)
File "/home/ubuntu/anaconda3/lib/python3.6/site-packages/pandas/core/generic.py", line 2712, in to_sql
method=method,
File "/home/ubuntu/anaconda3/lib/python3.6/site-packages/pandas/io/sql.py", line 518, in to_sql
method=method,
File "/home/ubuntu/anaconda3/lib/python3.6/site-packages/pandas/io/sql.py", line 1320, in to_sql
table.insert(chunksize, method=method)
File "/home/ubuntu/anaconda3/lib/python3.6/site-packages/pandas/io/sql.py", line 734, in insert
keys, data_list = self.insert_data()
File "/home/ubuntu/anaconda3/lib/python3.6/site-packages/pandas/io/sql.py", line 710, in insert_data
d = np.array(b.get_values(), dtype=object)
MemoryError```
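Since the per-table data starts life as a CSV dumped from the .mdb file, another option along the same lines is to stream that CSV with pandas.read_csv(chunksize=...) so the full 1.2M-row DataFrame is never built in memory at once. A minimal sketch under that assumption; csv_path is a hypothetical path to the file on disk (the original code passes an already-built DataFrame instead), and it again reuses psql_insert_copy and the connection variables from above.

```
import pandas as pd

def csv_to_postgres_streamed(table_name, csv_path, chunksize=100_000):
    engine = create_engine(f'postgresql://{PGusername}:{PGpassword}@{PGendpoint}:5432/testDB')
    # read_csv with chunksize yields an iterator of smaller DataFrames
    for i, chunk in enumerate(pd.read_csv(csv_path, chunksize=chunksize)):
        chunk.to_sql(table_name.lower(), engine,
                     if_exists='replace' if i == 0 else 'append',
                     index=False,  # drop the synthetic RangeIndex created by read_csv
                     method=psql_insert_copy)
```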
Question source: Stack Overflow — /questions/59379277/python-issue-with-inserting-large-dataframe-1-2m-rows-into-postgres