中文csv文本编码转utf8那些事 - python
请参考以下代码:
# -*- coding: utf-8 -*-
##!/usr/bin/python3
# @Author : Jack Lee
# @Email : 291148484@163.com
"""Read text/CSV files in common Chinese encodings and rewrite them as UTF-8."""
import os
import time
import codecs

try:
    import chardet
except ImportError:
    # Only automatic detection needs chardet. The explicit open_*_as_str
    # helpers and the ``codes=`` override keep working without it.
    chardet = None


class CodeError(ValueError):
    """Raised when a file's encoding cannot be detected or is not supported."""


def get_time() -> str:
    """Return the current local time formatted as ``YYYY-MM-DD HH:MM:SS``."""
    return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())


def which_codes(filepath):
    """Detect the encoding of *filepath* and return its name in lower case.

    The whole file is read as bytes and handed to ``chardet.detect``.

    Raises:
        ImportError: chardet is not installed.
        CodeError: chardet reported no encoding for the content.
    """
    if chardet is None:
        raise ImportError(
            "chardet is required for automatic encoding detection"
        )
    with open(filepath, 'rb') as f:
        content = f.read()
    encoding = chardet.detect(content).get('encoding')
    if encoding is None:
        # Without this guard the ``.lower()`` below fails with an unhelpful
        # AttributeError on None.
        raise CodeError(
            f'Text file:"{filepath}" which encoding method cannot be detected.'
        )
    return encoding.lower()


def _read_text(filepath, encoding) -> str:
    """Read *filepath* in text mode (newlines translated to ``\\n``)."""
    with open(filepath, 'r', encoding=encoding) as f:
        return f.read()


def _read_bytes_as_text(filepath, encoding) -> str:
    """Decode the raw bytes of *filepath*; line endings are left untouched."""
    with open(filepath, 'rb') as f:
        return str(f.read(), encoding=encoding)


def open_gbk_as_str(filepath) -> str:
    """Return the content of a GBK encoded file."""
    return _read_text(filepath, 'gbk')


def open_big5_as_str(filepath) -> str:
    """Return the content of a Big5 encoded file."""
    return _read_text(filepath, 'big5')


def open_big5hkscs_as_str(filepath) -> str:
    """Return the content of a Big5-HKSCS encoded file."""
    return _read_text(filepath, 'big5hkscs')


def open_cp950_as_str(filepath) -> str:
    """Return the content of a CP950 encoded file."""
    return _read_text(filepath, 'cp950')


def open_gb2312_as_str(filepath) -> str:
    """Return the content of a GB2312 encoded file."""
    return _read_text(filepath, 'gb2312')


def open_hz_as_str(filepath) -> str:
    """Return the content of an HZ encoded file."""
    return _read_text(filepath, 'hz')


def open_ascii_as_str(filepath) -> str:
    """Return the content of an ASCII encoded file."""
    return _read_text(filepath, 'ascii')


def open_utf8_as_str(filepath) -> str:
    """Return the content of a UTF-8 encoded file."""
    return _read_text(filepath, 'utf-8')


def open_utf16_as_str(filepath) -> str:
    """Return the content of a UTF-16 file (line endings are not translated)."""
    return _read_bytes_as_text(filepath, 'utf-16')


def open_utf32_as_str(filepath) -> str:
    """Return the content of a UTF-32 file (line endings are not translated)."""
    return _read_bytes_as_text(filepath, 'utf-32')


# (label printed in the log line, reader, lower-case names that select it)
_ENCODING_FAMILIES = (
    ("GBK", open_gbk_as_str, ("936", "cp936", "ms936", "gbk")),
    ("big5", open_big5_as_str, ("big5", "big5-tw", "csbig5")),
    ("big5hkscs", open_big5hkscs_as_str,
     ("big5hkscs", "big5-hkscs", "hkscs")),
    ("cp950", open_cp950_as_str, ("cp950", "950", "ms950")),
    ("gb2312", open_gb2312_as_str,
     ("gb2312", "chinese", "csiso58gb231280", "euc-cn", "euccn",
      "eucgb2312-cn", "gb2312-1980", "gb2312-80", "iso-ir-58")),
    ("hz", open_hz_as_str, ("hz", "hzgb", "hz-gb", "hz-gb-2312")),
    ("ascii", open_ascii_as_str, ("ascii",)),
    ("utf-16", open_utf16_as_str, ("utf-16", "u16", "utf16", "utf_16")),
    ("utf-32", open_utf32_as_str, ("utf-32", "u32", "utf32", "utf_32")),
    ("utf-8", open_utf8_as_str, ("utf-8", "u8", "cp65001", "utf8", "utf")),
)


def open_as_str(filepath, codes=None):
    """Return the decoded text of *filepath*.

    Args:
        filepath: Path of the text file to read.
        codes: Optional encoding name. When omitted the encoding is detected
            with ``which_codes``. The name is matched case-insensitively
            against the aliases in ``_ENCODING_FAMILIES``.

    Returns:
        The file content as ``str``.

    Raises:
        CodeError: The encoding is not one of the supported families.
    """
    codes = which_codes(filepath) if codes is None else str(codes).lower()
    for label, reader, aliases in _ENCODING_FAMILIES:
        if codes not in aliases:
            continue
        if label == "ascii":
            print("[INFO] " + get_time() + " encoding = ascii")
        else:
            print("[INFO] " + get_time() + " " + label + ", codes =", codes)
        # The readers already return ``str``; re-encoding to UTF-8 and
        # decoding again would not change the value.
        return reader(filepath)
    # other unrecognized codes:
    print('[CRITICAL] ' + get_time() + ' The current encoding used is:"'
          + codes + '", the program failed to process this encoding.')
    raise CodeError(
        f'Text file:"{filepath}" which encoding method cannot be read.'
    )


# Used to replace the specified csv with the corresponding utf8 encoded csv.
def replace_by_utf8_csv(filepath, codes=None):
    """Rewrite *filepath* in place as UTF-8, keeping its content.

    Args:
        filepath: Path of the CSV file to convert.
        codes: Optional encoding name forwarded to ``open_as_str``.

    Raises:
        CodeError: The source encoding is not supported; the file is left
            untouched because it is only opened for writing after decoding.
    """
    text = open_as_str(filepath, codes)
    # The UTF-16/32 readers keep raw "\r\n"; normalise so that the text-mode
    # write below does not produce "\r\r\n" on Windows or stray "\r".
    text = text.replace('\r\n', '\n').replace('\r', '\n')
    with open(filepath, 'w', encoding='utf-8') as f:
        f.write(text)
    print(f'[INFO] {get_time()} Translated file {filepath} into utf-8.')


# Example:
# print(open_as_str(r'C:\Users\a2911\Desktop\script\sources\aaa.csv'))
说明:
- 调用 `open_as_str` 函数用于打开一个文本文件,得到相应的 utf-8 字符串;
- 调用 `replace_by_utf8_csv` 用于将一个非 utf-8 的 csv 替换为同名的 utf-8 csv。