引言
最近有个项目需要解析 XML 文件,获取其中的节点内容,小编选择了编码简单又高效的 dom4j 来完成。
1、xml内容
<?xml version="1.0" encoding="UTF-8"?> <RecognizeResult> <Speech Uri="/Sub/2019-12-03.3/file/5149-15892322607-20191202141010-rJKTcXfpB_datang.wav" Duration="252840"> <ResultCode>0</ResultCode> <Confidence>100</Confidence> <Subject Name="RecognizeText"> <Role Name="R0"> <EndPoint Count="44"> <Item Begin="13340" End="13450"> <Text>喂。 </Text> <Time>13340,13450 </Time> </Item> <Item Begin="15860" End="16240"> <Text>喂。 </Text> <Time>15860,16240 </Time> </Item> </EndPoint> </Role> <Role Name="R1"> <EndPoint Count="35"> <Item Begin="17990" End="20080"> <Text>哎 喂 是 王 斌 先生 是吗 啊! </Text> <Time>17990,18100 18100,18340 18340,18550 18550,18940 18940,19120 19120,19510 19510,19820 19860,20080 </Time> </Item> <Item Begin="20630" End="21190"> <Text>对 是啊! </Text> <Time>20630,20860 20860,21190 </Time> </Item> </EndPoint> </Role> </Subject> </Speech> </RecognizeResult>
需求是:将其中的汉字按角色(R0、R1)解析出来,拼接成字符串,然后发送给消息队列。
2、引入jar包
<dependency> <groupId>dom4j</groupId> <artifactId>dom4j</artifactId> <version>1.6.1</version> </dependency>
3、代码实现
package com.zj.zhijian.service; import com.zqf.common.utils.DateUtils; import org.dom4j.Attribute; import org.dom4j.Document; import org.dom4j.DocumentException; import org.dom4j.Element; import org.dom4j.io.SAXReader; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.springframework.beans.factory.annotation.Value; import org.springframework.stereotype.Service; import java.io.File; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; /** * @author zhenghao * @description: 解析xml * @date 2019/12/318:33 */ @Service public class ParseXmlService { private static Logger log = LoggerFactory.getLogger(ParseXmlService.class); @Value("${base.file.path}") private String baseFilePath; //获得文件 测试方法 public void xmlFile() { //String toDayStartYMD = DateUtils.getToDayStartYMD(); String date = "2019-12"; for (int j = 1; j <= 4; j++) { String toDayStartYMD = date + "-0" + j; for (int i = 0; i < 24; i++) { String filePath = baseFilePath + toDayStartYMD + "." 
+ i + "/file/"; log.info("文件路径" + filePath); parseXml(filePath); } } } public void parseXml(String strFile) { try { long l = System.currentTimeMillis(); List<String> R0List = new ArrayList<>(); List<String> R1List = new ArrayList<>(); File file = new File(strFile); String[] filePath = file.list(); if (filePath == null || filePath.length <= 0) { return; } log.info("xml个数" + filePath.length); for (String s : filePath) { if (!s.contains(".xml")) { continue; } String tempFilePath = strFile + s; //1.创建Reader对象 SAXReader reader = new SAXReader(); //2.加载xml Document document = reader.read(new File(tempFilePath)); //3.获取根节点 Element rootElement = document.getRootElement(); StringBuilder sb = new StringBuilder(); //4、获得指定子节点 Element speechElement = rootElement.element("Speech"); //5、获得节点属性 Attribute duration = speechElement.attribute("Duration"); String value = duration.getValue(); int telLength = Integer.valueOf(value) / 1000; if (telLength <= 45) { continue; } //默认返回第一节点 Element subjectElement = speechElement.element("Subject"); if (subjectElement == null) { continue; } Iterator iterator3 = subjectElement.elementIterator(); while (iterator3.hasNext()) { Element roleElement = (Element) iterator3.next(); Attribute name = roleElement.attribute("Name"); Element endPointElement = roleElement.element("EndPoint"); //获得所有子节点 Iterator iterator1 = endPointElement.elementIterator(); while (iterator1.hasNext()) { Element itemElement = (Element) iterator1.next(); Element textElement = itemElement.element("Text"); String stringValue = textElement.getStringValue(); if (name.getValue().equals("R0")) { sb.append(stringValue); R0List.add(stringValue); } else { R1List.add(stringValue); } } } } System.out.println(System.currentTimeMillis() - l); } catch (DocumentException e) { e.printStackTrace(); } }
4、多种解析xml方式对比,请参考下面文章