使用utf-8编码将ElementTree直接写入zip_问答-阿里云开发者社区

我想修改大量的XML。它们存储在ZIP文件中。源XML是utf-8编码的（至少Linux上的file工具是这样判断的），并且具有正确的XML声明：`<?xml version='1.0' encoding='UTF-8'?>`。

目标ZIP和其中包含的XML也应具有正确的XML声明。但是，（至少对我来说）最明显的方法（使用ElementTree.tostring）失败。

这是一个独立的示例，应该可以立即使用。简短演练：

imports
preparations (creating src.zip, these ZIPs are a given in my actual application)
actual work of program (modifying XMLs), starting at ` # read XMLs from zip `

请专注于下部，特别是＃APPROACH 1，APPROACH 2，APPROACH 3：

import os
import tempfile
import zipfile
from xml.etree.ElementTree import Element, ElementTree, parse, tostring

# Scratch locations of the two source XMLs and of the ZIP that bundles them;
# everything lives in the system temp directory.
src_1, src_2, src_zip = (
    os.path.join(tempfile.gettempdir(), file_name)
    for file_name in ("one.xml", "two.xml", "src.zip")
)

# One target ZIP per approach demonstrated below.
trgt_appr1_zip, trgt_appr2_zip, trgt_appr3_zip = (
    os.path.join(tempfile.gettempdir(), file_name)
    for file_name in ("trgt_appr1.zip", "trgt_appr2.zip", "trgt_appr3.zip")
)

# Intermediate file on hard disk that APPROACH 3 needs, because it serializes
# through ElementTree.write() before the data goes into the ZIP.
tmp_xml_name = os.path.join(tempfile.gettempdir(), "curr_xml.tmp")

# prepare src.zip
# Build two small sample documents and write them to disk as UTF-8 with an
# XML declaration. The second one carries a non-ASCII attribute name.
tree1 = ElementTree(Element('hello', {'beer': 'good'}))
tree1.write(src_1, encoding="UTF-8", xml_declaration=True)
tree2 = ElementTree(Element('scnd', {'äkey': 'a value'}))
tree2.write(src_2, encoding="UTF-8", xml_declaration=True)

# Mode 'w' instead of 'a': re-running the example must not append duplicate
# members to a src.zip left over from a previous run.
with zipfile.ZipFile(src_zip, 'w') as src:
    # Store the files' bytes unchanged; decoding them to str and encoding them
    # again would be a pointless round trip.
    src.write(src_1, arcname="one.xml")
    src.write(src_2, arcname="two.xml")

# The loose XML files are no longer needed once they are inside the archive.
os.remove(src_1)
os.remove(src_2)

# read XMLs from zip
with zipfile.ZipFile(src_zip, 'r') as zfile:

    # (member name, modified tree) pairs, in archive order
    updated_trees = []

    for xml_name in zfile.namelist():

        # close the member handle as soon as the document has been parsed
        with zfile.open(xml_name, 'r') as curr_file:
            tree = parse(curr_file)
        # modify tree
        tree.getroot().append(Element('new', {'newkey': 'new value'}))
        updated_trees.append((xml_name, tree))

# write to target files
# Every target archive is opened exactly once, outside the loop, instead of
# being re-opened for each XML. Mode 'w' keeps re-runs of the example from
# piling up duplicate members in archives left over from a previous run.
with zipfile.ZipFile(trgt_appr1_zip, 'w') as trgt1_zip, \
        zipfile.ZipFile(trgt_appr2_zip, 'w') as trgt2_zip, \
        zipfile.ZipFile(trgt_appr3_zip, 'w') as trgt3_zip:

    for xml_name, updated_tree in updated_trees:

        #
        # APPROACH 1 [DESIRED, BUT DOES NOT WORK]: write tree to zip-file
        # the XML declaration is missing from the output
        #
        # create byte representation of elementtree
        byte_representation = tostring(element=updated_tree.getroot(), encoding='UTF-8', method='xml')
        # write XML directly to zip
        trgt1_zip.writestr(zinfo_or_arcname=xml_name, data=byte_representation)

        #
        # APPROACH 2 [WORKS IN THEORY, BUT DOES NOT WORK]: write tree to zip-file
        # encoding in XML declaration is faulty (is 'utf8', should be 'utf-8' or 'UTF-8')
        #
        # create byte representation of elementtree
        byte_representation = tostring(element=updated_tree.getroot(), encoding='utf8', method='xml')
        # write XML directly to zip
        trgt2_zip.writestr(zinfo_or_arcname=xml_name, data=byte_representation)

        #
        # APPROACH 3 [WORKS, BUT LACKS PERFORMANCE]: write to file, then read from file, then write to zip
        #
        # write to file
        updated_tree.write(tmp_xml_name, encoding="UTF-8", method="xml", xml_declaration=True)
        # read from file
        with open(tmp_xml_name, 'r', encoding="utf-8") as tmp:
            string_representation = tmp.read()
        # write to zip
        trgt3_zip.writestr(zinfo_or_arcname=xml_name, data=string_representation.encode("utf-8"))

# The scratch file only exists if at least one XML was processed.
if os.path.exists(tmp_xml_name):
    os.remove(tmp_xml_name)

方法3可行，但它比另外两种方法占用更多的资源。

APPROACH 2 is the only way I could get an ElementTree object to be written with an actual XML declaration -- which then turns out to be invalid ( utf8 instead of UTF-8 / utf-8 ).

APPROACH 1 would be most desired -- but fails during reading later in the pipeline, as the XML declaration is missing.

Question: How can I get rid of writing the whole XML to disk first, only to read it afterwards, write it to the zip and delete it after being done with the zip? What am I missing?

问题来源: stackoverflow

import zipfile
from io import BytesIO
from xml.etree.ElementTree import ElementTree, Element

tree = ElementTree(Element('hello', {'beer': 'good'}))

# Serialize into an in-memory buffer instead of a temporary file on disk;
# ElementTree.write() is asked to emit the XML declaration.
bio = BytesIO()
tree.write(bio, encoding='UTF-8', xml_declaration=True)

# Hand the buffered bytes to the archive as the member 'test.xml'.
with zipfile.ZipFile('/tmp/test.zip', 'w') as z:
    z.writestr('test.xml', bio.getvalue())

import zipfile
from xml.etree.ElementTree import ElementTree, Element

tree = ElementTree(Element('hello', {'beer': 'good'}))

with zipfile.ZipFile('/tmp/test.zip', 'w') as z:
    # Open the archive member itself as a writable binary file and let
    # ElementTree write straight into it, with no intermediate buffer or
    # temporary file.
    with z.open('test.xml', 'w') as f:
        tree.write(f, encoding='UTF-8', xml_declaration=True)

热门

活动广场

任务中心

开发者评测

高校计划

乘风者计划

训练营

阿里云MVP

话题

直播

下载

镜像站

技术资料

插件

使用utf-8编码将ElementTree直接写入zip

相关文章

相关电子书