将Doc或者Docx文档处理成html的代码逻辑
下面是maven的配置代码:
<!-- 文档处理所需的jar的依赖 --> <dependency> <groupId>commons-io</groupId> <artifactId>commons-io</artifactId> <version>2.4</version> </dependency> <dependency> <groupId>org.apache.poi</groupId> <artifactId>poi-examples</artifactId> <version>3.9</version> </dependency> <dependency> <groupId>org.apache.poi</groupId> <artifactId>poi-scratchpad</artifactId> <version>3.9</version> </dependency> <dependency> <groupId>fr.opensagres.xdocreport</groupId> <artifactId>org.apache.poi.xwpf.converter.xhtml</artifactId> <version>1.0.4</version> </dependency> <dependency> <groupId>fr.opensagres.xdocreport</groupId> <artifactId>org.apache.poi.xwpf.converter.core</artifactId> <version>1.0.4</version> </dependency> <dependency> <groupId>org.apache.poi</groupId> <artifactId>poi-ooxml</artifactId> <version>3.9</version> </dependency> <dependency> <groupId>org.apache.poi</groupId> <artifactId>poi</artifactId> <version>3.9</version> </dependency> <dependency> <groupId>org.apache.poi</groupId> <artifactId>poi-ooxml-schemas</artifactId> <version>3.9</version> </dependency> <dependency> <groupId>org.apache.xmlbeans</groupId> <artifactId>xmlbeans</artifactId> <version>2.3.0</version> </dependency> <dependency> <groupId>org.apache.poi</groupId> <artifactId>ooxml-schemas</artifactId> <version>1.1</version> </dependency> <!-- 文档处理所需的jar的依赖 -->
将word处理成html的代码:
import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.regex.Matcher; import java.util.regex.Pattern; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; import javax.xml.transform.OutputKeys; import javax.xml.transform.Transformer; import javax.xml.transform.TransformerException; import javax.xml.transform.TransformerFactory; import javax.xml.transform.dom.DOMSource; import javax.xml.transform.stream.StreamResult; import org.apache.commons.io.FileUtils; import org.apache.commons.io.IOUtils; import org.apache.commons.io.output.ByteArrayOutputStream; import org.apache.commons.lang.StringUtils; import org.apache.log4j.Logger; import org.apache.poi.hwpf.HWPFDocument; import org.apache.poi.hwpf.converter.PicturesManager; import org.apache.poi.hwpf.converter.WordToHtmlConverter; import org.apache.poi.hwpf.extractor.WordExtractor; import org.apache.poi.hwpf.usermodel.Picture; import org.apache.poi.hwpf.usermodel.PictureType; import org.apache.poi.xwpf.converter.core.BasicURIResolver; import org.apache.poi.xwpf.converter.core.FileImageExtractor; import org.apache.poi.xwpf.converter.core.FileURIResolver; import org.apache.poi.xwpf.converter.xhtml.XHTMLConverter; import org.apache.poi.xwpf.converter.xhtml.XHTMLOptions; import org.apache.poi.xwpf.usermodel.XWPFDocument; import org.apache.poi.xwpf.usermodel.XWPFParagraph; import org.apache.poi.xwpf.usermodel.XWPFTable; import org.apache.poi.xwpf.usermodel.XWPFTableCell; import org.apache.poi.xwpf.usermodel.XWPFTableRow; import org.w3c.dom.Document; import com.sun.org.apache.xalan.internal.xsltc.compiler.Template; import cn.com.hbny.docdetection.entity.ResourcesWord; import cn.com.hbny.docdetection.server.ExtendedServerConfig; import 
cn.com.hbny.docdetection.utils.Pinyin4jUtils.PinyinType; /** * @brief ReadWordUtils.java 文档处理对应的工具类 * @attention * @author toto * @date 2017年3月3日 * @note begin modify by 涂作权 2017年3月3日 原始创建 */ public final class ReadWordUtils { private static Logger logger = Logger.getLogger(ReadWordUtils.class); protected static final String CHARSET_UTF8 = "UTF-8"; private static String tempImagePath = ""; /** * 读取docx * @throws Exception */ public static ResourcesWord readDocx(String path) throws Exception { int paragNum = 0; // 段落的个数 int sentenceNum = 0; // 句子个数 int wordNum = 0; // 字体个数 StringBuffer content = new StringBuffer(); ResourcesWord resourcesWord = new ResourcesWord(); InputStream is = new FileInputStream(path); XWPFDocument doc = new XWPFDocument(is); List<XWPFParagraph> paras = doc.getParagraphs(); for (XWPFParagraph para : paras) { // 当前段落的属性 if (!StringUtils.isEmpty(para.getText())) { paragNum++; sentenceNum += para.getText().replace("\r\n", "").trim().split("。").length; content.append(para.getText()); } } // 获取文档中所有的表格 List<XWPFTable> tables = doc.getTables(); List<XWPFTableRow> rows; List<XWPFTableCell> cells; for (XWPFTable table : tables) { // 表格属性 // 获取表格对应的行 rows = table.getRows(); for (XWPFTableRow row : rows) { // 获取行对应的单元格 cells = row.getTableCells(); for (XWPFTableCell cell : cells) { content.append(cell.getText()); } } /* * MongoDBUtils mongoDb = new MongoDBUtils("javadb"); DBObject dbs = * new BasicDBObject(); dbs.put("name", "创新性"); //分类 * dbs.put("major", "医疗"); //专业 dbs.put("content", * content.toString().trim()); dbs.put("paragNum", paragNum); * dbs.put("sentenceNum", sentenceNum); dbs.put("wordNum", wordNum); * mongoDb.insert(dbs, "javadb"); */ } // 得到全部内容的字数 wordNum += content.toString().trim().length(); resourcesWord.setContent(content.toString()); resourcesWord.setParagNum(paragNum); resourcesWord.setSentenceNum(sentenceNum); resourcesWord.setWordNum(wordNum); close(is); return resourcesWord; } /** * 读取doc文件的内容 * * @throws IOException */ public 
static ResourcesWord readDoc(String path) throws IOException { int paragNum = 0; // 段落的个数 int sentenceNum = 0; // 句子个数 int wordNum = 0; // 字体个数 ResourcesWord resourcesWord = new ResourcesWord(); StringBuffer content = new StringBuffer(); try { File f = new File(path); FileInputStream is = new FileInputStream(f); WordExtractor ex = new WordExtractor(is);// is是WORD文件的InputStream String[] paragraph = ex.getParagraphText(); for (int i = 0; i < paragraph.length; i++) { paragNum++; System.out.println("Paragraph " + (i + 1) + " : " + paragraph[i]); sentenceNum += paragraph[i].replace("\r\n", "").trim().split("。").length; wordNum += paragraph[i].trim().length(); content.append(paragraph[i].trim()); } System.out.println("段落:" + paragNum); System.out.println("句子:" + sentenceNum); System.out.println("字体:" + wordNum); resourcesWord.setContent(content.toString()); resourcesWord.setParagNum(paragNum); resourcesWord.setSentenceNum(sentenceNum); resourcesWord.setWordNum(wordNum); /* * MongoDBUtils mongoDb = new MongoDBUtils("javadb"); DBObject dbs = * new BasicDBObject(); dbs.put("name", "创新性"); //分类 * dbs.put("major", "医疗"); //专业 dbs.put("content", * content.toString()); dbs.put("paragNum", paragNum); * dbs.put("sentenceNum", sentenceNum); dbs.put("wordNum", wordNum); * mongoDb.insert(dbs, "javadb"); */ is.close(); } catch (Exception e) { e.printStackTrace(); } return resourcesWord; } /** * \brief doc转换成html,并返回输出的相对路径 * @param filePath :要转换的doc文档 * @param outPutFilePath :文档输出的位置 * @attention * @author toto * @throws IOException * @throws FileNotFoundException * @throws ParserConfigurationException * @date 2017年2月27日 * @note begin modify by 涂作权 2017年2月27日 原始创建 */ public static String doc2Html( String filePath, final String outPutFilePath) throws TransformerException, IOException, ParserConfigurationException { HWPFDocument wordDocument = new HWPFDocument(new FileInputStream(filePath)); WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter( DocumentBuilderFactory 
.newInstance() .newDocumentBuilder() .newDocument()); wordToHtmlConverter.setPicturesManager(new PicturesManager() { public String savePicture(byte[] content, PictureType pictureType, String suggestedName, float widthInches, float heightInches) { //File file = new File(outPutFilePath); //String name = file.getName(); tempImagePath = outPutFilePath.substring(0,outPutFilePath.indexOf(".html")) + File.separator; File imageFolder = new File(tempImagePath); if (!imageFolder.exists()) { try { FileUtils.forceMkdir(imageFolder); } catch (IOException e) { e.printStackTrace(); } } String newTempImagePath = imageFolder.getPath().replace(imageFolder.getParentFile().getPath() + File.separator, ""); return newTempImagePath + File.separator + suggestedName; } }); wordToHtmlConverter.processDocument(wordDocument); // 保存图片 List<Picture> pics = wordDocument.getPicturesTable().getAllPictures(); if (pics != null) { for (int i = 0; i < pics.size(); i++) { Picture pic = (Picture) pics.get(i); try { File picOutFolder = new File(tempImagePath + File.separator); if (!picOutFolder.exists()) { picOutFolder.mkdirs(); } pic.writeImageContent(new FileOutputStream(tempImagePath + File.separator + pic.suggestFullFileName())); } catch (FileNotFoundException e) { e.printStackTrace(); } } } Document htmlDocument = wordToHtmlConverter.getDocument(); ByteArrayOutputStream out = new ByteArrayOutputStream(); DOMSource domSource = new DOMSource(htmlDocument); StreamResult streamResult = new StreamResult(out); TransformerFactory tf = TransformerFactory.newInstance(); Transformer serializer = tf.newTransformer(); serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8"); serializer.setOutputProperty(OutputKeys.INDENT, "yes"); serializer.setOutputProperty(OutputKeys.METHOD, "html"); serializer.transform(domSource, streamResult); out.close(); writeFile(new String(out.toByteArray()), outPutFilePath); return gainRelativePathByOutputPath(outPutFilePath); } /** * 将docx格式的word转换为html格式的文档 * * @param filePath 
原始的docx文件路径存储位置 * @param outPutFile html输出文件路径 * @return * @throws TransformerException * @throws IOException * @throws ParserConfigurationException */ public static String docx2Html(String filePath, final String outPutFilePath) throws TransformerException, IOException, ParserConfigurationException { //String fileOutName = outPutFile; XWPFDocument wordDocument = new XWPFDocument(new FileInputStream(filePath)); XHTMLOptions options = XHTMLOptions.create().indent(4); // 导出图片 Map<String, String> imageInfoMap = gainTempImagePath(outPutFilePath); File imageFolder = new File(imageInfoMap.get("imageStoredPath")); options.setExtractor(new FileImageExtractor(imageFolder)); // URI resolver //这种方式获得word中的图片地址是绝对地址 //options.URIResolver(new FileURIResolver(imageFolder)); //设置生成的html中的img src中的地址是相对路径 options.URIResolver(new BasicURIResolver(imageInfoMap.get("imageFolder"))); File outFile = new File(outPutFilePath); outFile.getParentFile().mkdirs(); OutputStream out = new FileOutputStream(outFile); XHTMLConverter.getInstance().convert(wordDocument, out, options); return gainRelativePathByOutputPath(outPutFilePath); //System.out.println("Generate " + fileOutName + " with " + (System.currentTimeMillis() - startTime) + " ms."); } /** * \brief 将内容写到path路径下面 * @param content :文档内容 * @param path :最终的文件存储路径 * @attention 方法的使用注意事项 * @author toto * @date 2017年2月27日 * @note begin modify by 涂作权 2017年2月27日 修改输出的文件名称 */ public static void writeFile(String docContent, String path) { FileOutputStream outDocFos = null; try { //判断文件是否为空的 if (StringUtils.isNotBlank(path)) { File file = new File(path); if (!file.exists()) { FileUtils.forceMkdir(file.getParentFile()); } outDocFos = new FileOutputStream(path); IOUtils.write(docContent, outDocFos,CHARSET_UTF8); } } catch (FileNotFoundException fnfe) { fnfe.printStackTrace(); } catch (IOException ioe) { ioe.printStackTrace(); } finally { try { if (outDocFos != null) outDocFos.close(); } catch (IOException ie) { } } } /** * 关闭输入流 * * @param is */ 
private static void close(InputStream is) { if (is != null) { try { is.close(); } catch (IOException e) { e.printStackTrace(); } } } /** * \brief 通过文档输出路径获得图片存储路径 * @param outPutFile :文档输出路径 * @return * @attention 方法的使用注意事项 * @author toto * @date 2017年2月28日 * @note begin modify by 修改人 修改时间 修改内容摘要说明 */ private static Map<String, String> gainTempImagePath(String outPutFilePath) { Map<String,String> imageInfoMap = new HashMap<String,String>(); try { //File file = new File(outPutFilePath); tempImagePath = outPutFilePath.substring(0,outPutFilePath.indexOf(".html")) + File.separator; File imageFolder = new File(tempImagePath); if (!imageFolder.exists()) { try { FileUtils.forceMkdir(imageFolder); } catch (IOException e) { e.printStackTrace(); } } //System.out.println(imageFolder.getPath().replace(imageFolder.getParentFile().getPath() + File.separator, "")); //return imageFolder.getPath().replace(imageFolder.getParentFile().getPath() + File.separator, ""); imageInfoMap.put("imageStoredPath", imageFolder.getPath()); imageInfoMap.put("imageFolder", imageFolder.getPath().replace(imageFolder.getParentFile().getPath(), "").replace(File.separator, "")); return imageInfoMap; } catch (Exception e) { e.printStackTrace(); } return null; } private static String gainRelativePathByOutputPath(String outPutFilePath) { //用于预览的存储路径 String docsPreviewPath = ExtendedServerConfig.getInstance().getStringProp("DOCS_PREVIEW_PREFIX"); return outPutFilePath.split(docsPreviewPath)[1]; } /** * \brief * @param orgStr :表示要替换的就得字符串 * @param regEx :表示的是正则表达式 * @param targetStr :表示要替换的字符串 * @return * @attention 方法的使用注意事项 * @author toto * @date 2017年3月4日 * @note begin modify by 涂作权 原始创建 2017年3月4日 */ public static String replaceStr(String orgStr,String regEx,String targetStr){ if (null !=orgStr && !"".equals(orgStr.trim())) { //String regEx="[\\s~·`!!@#¥$%^……&*(())\\-——\\-_=+【\\[\\]】{{}}\\|、\\\\;;::‘'“”\",,《<。.》>、/??]"; Pattern p = Pattern.compile(regEx); Matcher m = p.matcher(orgStr); return 
m.replaceAll(targetStr); } return null; } public static void main(String[] args) throws Exception { // String uploadFile = ExtendedServerConfig.getInstance().getStringProperty("UPLOAD_PATH"); // String docsTempPath = ExtendedServerConfig.getInstance().getStringProperty("DOCS_TEMP_PATH"); // String docsOutputPath = ExtendedServerConfig.getInstance().getStringProp("DOCS_OUTPUT_PATH"); // System.out.println("uploadFile = " + uploadFile + " " + docsTempPath + " " + docsOutputPath); // // Testtest.readWord("E://111.doc"); // Testtest.readDoc(); // System.out.println(content); // ResourcesWord readDocx = ReadWordUtils.readDoc(uploadFile + "/大学生创新创业项目申报书.doc"); // logger.info(readDocx.getContent()); // logger.info(readDocx.getParagNum()); // // new ReadWordUtils().doc2Html(uploadFile + "/大学生创新创业项目申报书.doc" , docsOutputPath + "/大学生创新创业项目申报书.html"); //new ReadWordUtils().docx2Html(uploadFile + "/大学生创新创业项目申报书副本.docx" , docsOutputPath + "/大学生创新创业项目申报书副本.html"); String newStr = replaceStr("afdas//\\as dfasd a//asd\\\\\\asd\\/", "[\\\\]","/"); newStr = replaceStr(newStr, "(/){1,}", "/"); newStr = replaceStr(newStr, "[ ]", ""); System.out.println(newStr); } }
下面是调用案例:
import java.io.File; import org.apache.log4j.Logger; import org.springframework.stereotype.Service; import cn.com.hbny.docdetection.mongodb.beans.DocInfo; import cn.com.hbny.docdetection.server.ExtendedServerConfig; import cn.com.hbny.docdetection.service.base.impl.BaseServiceImpl; import cn.com.hbny.docdetection.service.docInfoHandler.DocInfoHandlerService; import cn.com.hbny.docdetection.utils.Pinyin4jUtils; import cn.com.hbny.docdetection.utils.ReadWordUtils; import cn.com.hbny.docdetection.utils.UUIDGenerator; import cn.com.hbny.docdetection.utils.Pinyin4jUtils.PinyinType; /** * @brief DocInfoHandlerServiceImpl.java 文档检测对应的文档 * @attention * @author toto * @date 2017年3月2日 * @note begin modify by 涂作权 2017年3月2日 原始创建 */ @Service(value = "docInfoHandlerService") public class DocInfoHandlerServiceImpl extends BaseServiceImpl implements DocInfoHandlerService { private static Logger logger = Logger.getLogger(DocInfoHandlerServiceImpl.class); /** * 文档处理对应的service * @param docLibrayId :文档库对应的id * @param originalDocPath :原始文档所在的位置 * @param uploadPath :文档上传路径 * @param outPutFolderPath :文档最终的输出文件夹 * @param docsPreviewPrefix :文档预览的前缀 */ public DocInfo handlerSingleDocInfo( String docLibrayId, String originalDocPath, String uploadPath, String outPutFolderPath, String docsPreviewPrefix) { try { DocInfo docInfo = new DocInfo(); docInfo.setId(UUIDGenerator.generate()); docInfo.setDocLibrayId(docLibrayId); //处理传递过来的文件路径 File file = new File(originalDocPath); //判断文件是否哦存在,如果不存在直接返回,如果存在继续下面的操作 if (file.exists()) { //获取到文档的名称 String fileName = file.getName(); docInfo.setOriginalFileName(fileName.substring(0,fileName.toLowerCase().indexOf(".doc"))); //截取上传文件的后面那一串路径 String fileRelativePath = originalDocPath.substring(uploadPath.length()); docInfo.setOriginalDocPath(fileRelativePath); //判断文件后缀 if (fileName.endsWith(".doc")) { //1、处理word文档,并将word文档存储在相应的位置上,将word存储成html String outPutFilePath = Pinyin4jUtils.toPinYin( outPutFolderPath + fileRelativePath.replace(".doc", ".html"), 
PinyinType.LOWERCASE); outPutFilePath = ReadWordUtils.replaceStr(outPutFilePath, "[\\\\]","/"); outPutFilePath = ReadWordUtils.replaceStr(outPutFilePath, "(/){1,}", "/"); outPutFilePath = ReadWordUtils.replaceStr(outPutFilePath, "[ ]", ""); //下面是经过处理后的文件存储位置 String filePathAfterHandled = ReadWordUtils.doc2Html(originalDocPath,outPutFilePath); docInfo.setHtmlDocPath(filePathAfterHandled); } else { //1、处理word文档,并将word文档存储在相应的位置上,将word存储成html //1、处理word文档,并将word文档存储在相应的位置上,将word存储成html String outPutFilePath = Pinyin4jUtils.toPinYin( outPutFolderPath + fileRelativePath.replace(".docx", ".html"), PinyinType.LOWERCASE); outPutFilePath = ReadWordUtils.replaceStr(outPutFilePath, "[\\\\]","/"); outPutFilePath = ReadWordUtils.replaceStr(outPutFilePath, "(/){1,}", "/"); outPutFilePath = ReadWordUtils.replaceStr(outPutFilePath, "[ ]", ""); //下面是经过处理后的文件存储位置 String filePathAfterHandled = ReadWordUtils.docx2Html(originalDocPath, outPutFilePath); docInfo.setHtmlDocPath(filePathAfterHandled); } return null; } else { return null; } } catch (Exception e) { e.printStackTrace(); } return null; } public static void main(String[] args) { String uploadPath = ExtendedServerConfig.getInstance().getStringProperty("UPLOAD_PATH"); String outPutFolderPath = ExtendedServerConfig.getInstance().getStringProperty("DOCS_OUTPUT_PATH"); String docsPreviewPrefix = ExtendedServerConfig.getInstance().getStringProperty("DOCS_PREVIEW_PREFIX"); // new DocInfoHandlerServiceImpl().handlerSingleDocInfo( // UUIDGenerator.generate(), // uploadPath + "/双创项目申报书20170301/国家大学生创新训练计划项目申请书华师大.doc", // uploadPath, // outPutFolderPath); // new DocInfoHandlerServiceImpl().handlerSingleDocInfo( // UUIDGenerator.generate(), // uploadPath + "/双创项目申报书20170301/国家级大学生创新创业训练计划 立项申请书 上海电力学院.doc", // uploadPath, // outPutFolderPath, // docsPreviewPrefix); new DocInfoHandlerServiceImpl().handlerSingleDocInfo( UUIDGenerator.generate(), uploadPath + "/双创项目申报书20170301/专题产品需求规格说明书.docx", uploadPath, outPutFolderPath, 
docsPreviewPrefix); } }
下面是所有用到的参数配置:
#上传的文件的存储位置的配置,统一的最后面不要加斜杠
UPLOAD_PATH=D:/installed/apache-tomcat-7.0.47/webapps/upload
##处理后的文档输出位置,统一的最后面不要加斜杠
DOCS_OUTPUT_PATH=D:/installed/apache-tomcat-7.0.47/webapps/docs-output-path
##文档预览路径,注意最后面不要加斜杠
DOCS_PREVIEW_PREFIX=/docs-output-path
##处理文档时,生成的一些图片的临时存储路径,最后面不要加斜杠
DOCS_TEMP_PATH=D:/installed/apache-tomcat-7.0.47/webapps/temp