word转html

本文涉及的产品
公共DNS(含HTTPDNS解析),每月1000万次HTTP解析
全局流量管理 GTM,标准版 1个月
云解析 DNS,旗舰版 1个月
简介: word转html


  1. 添加jar依赖(Maven)。注意:下面的示例代码还引用了 commons-fileupload、spring-web 和 slf4j,这些依赖未在此列出,需自行引入
<dependency>
    <groupId>org.apache.commons</groupId>
    <artifactId>commons-compress</artifactId>
    <version>1.19</version>
</dependency>
<dependency>
    <groupId>org.apache.xmlbeans</groupId>
    <artifactId>xmlbeans</artifactId>
    <version>3.1.0</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi</artifactId>
    <version>4.1.2</version>
</dependency>
<!-- 针对2007以上版本的库 -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-ooxml</artifactId>
    <version>4.1.2</version>
</dependency>
<!-- 针对2003版本的库 -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-scratchpad</artifactId>
    <version>4.1.2</version>
</dependency>
<dependency>
    <groupId>fr.opensagres.xdocreport</groupId>
    <artifactId>fr.opensagres.poi.xwpf.converter.xhtml</artifactId>
    <version>2.0.3</version>
</dependency>
<dependency>
    <groupId>fr.opensagres.xdocreport</groupId>
    <artifactId>fr.opensagres.poi.xwpf.converter.core</artifactId>
    <version>2.0.3</version>
</dependency>
<!-- jsoup -->
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.11.3</version>
</dependency>
<!-- hutool-->
<dependency>
    <groupId>cn.hutool</groupId>
    <artifactId>hutool-all</artifactId>
    <version>5.0.2</version>
</dependency>

2. 代码生成

package com.gccx.core.util;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import java.util.HashMap;
import java.util.Map;
public class JsoupUtils {
    private static Map<String, String> getHtmlCss(String html) {
        org.jsoup.nodes.Document doc = Jsoup.parse(html);
        String[] styles = doc.head().select("style").html().split("\r\n");
        Map<String, String> css = new HashMap<>();
        for (String style : styles) {
            String[] kv = style.split("\\{|\\}");
            css.put(kv[0], kv[1]);
        }
        return css;
    }
    public static String changeHtmlCssLineStyle(String html) {
        Map<String, String> css = getHtmlCss(html);
        org.jsoup.nodes.Document doc = Jsoup.parse(html);
        Element body = doc.body();
        for (String key : css.keySet()) {
            body.select(key).attr("style", css.get(key)).outerHtml();
        }
        return body.html();
    }
}
package com.gccx.core.util;
import cn.hutool.core.collection.CollUtil;
import cn.hutool.core.io.FileUtil;
import cn.hutool.core.util.URLUtil;
import fr.opensagres.poi.xwpf.converter.xhtml.XHTMLConverter;
import fr.opensagres.poi.xwpf.converter.xhtml.XHTMLOptions;
import org.apache.commons.fileupload.FileItem;
import org.apache.commons.fileupload.FileItemFactory;
import org.apache.commons.fileupload.disk.DiskFileItemFactory;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.PicturesManager;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.hwpf.usermodel.PictureType;
import org.apache.poi.xwpf.usermodel.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.web.multipart.MultipartFile;
import org.springframework.web.multipart.commons.CommonsMultipartFile;
import org.w3c.dom.Document;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import java.io.*;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
public class WordToHtmlUtil {
    /**
     * logger
     */
    private static final Logger logger = LoggerFactory.getLogger(WordToHtmlUtil.class);
    /**
     * 解析docx成html
     *
     * @param file
     * @return
     * @throws IOException
     */
    public static String Word2007ToHtml(MultipartFile file) throws IOException {
        if (file.isEmpty() || file.getSize() <= 0) {
            logger.error("Sorry File does not Exists!");
            return null;
        } else {
            if (file.getOriginalFilename().endsWith(".docx") || file.getOriginalFilename().endsWith(".DOCX")) {
                // 1) 加载word文档生成 XWPFDocument对象
                InputStream in = file.getInputStream();
                XWPFDocument document = new XWPFDocument(in);
                // 也可以使用字符数组流获取解析的内容
                ByteArrayOutputStream baos = new ByteArrayOutputStream();
                XHTMLConverter.getInstance().convert(document, baos, null);
                String content = baos.toString();
                baos.close();
                return content;
            } else {
                logger.error("Enter only MS Office 2007+ files");
                return null;
            }
        }
    }
    /**
     * 解析doc文章成html 不存图片
     *
     * @param file
     * @return
     * @throws IOException
     * @throws ParserConfigurationException
     * @throws TransformerException
     */
    public static String Word2003ToHtml(MultipartFile file)
            throws IOException, ParserConfigurationException, TransformerException {
        if (file.isEmpty() || file.getSize() <= 0) {
            logger.error("Sorry File does not Exists!");
            return null;
        } else {
            if (file.getOriginalFilename().endsWith(".doc") || file.getOriginalFilename().endsWith(".DOC")) {
                InputStream input = file.getInputStream();
                HWPFDocument wordDocument = new HWPFDocument(input);
                WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(
                        DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument());
                // 解析word文档
                wordToHtmlConverter.processDocument(wordDocument);
                Document htmlDocument = wordToHtmlConverter.getDocument();
                // 也可以使用字符数组流获取解析的内容
                ByteArrayOutputStream baos = new ByteArrayOutputStream();
                DOMSource domSource = new DOMSource(htmlDocument);
                StreamResult streamResult = new StreamResult(baos);
                TransformerFactory factory = TransformerFactory.newInstance();
                Transformer serializer = factory.newTransformer();
                serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");
                serializer.setOutputProperty(OutputKeys.INDENT, "yes");
                serializer.setOutputProperty(OutputKeys.METHOD, "html");
                serializer.transform(domSource, streamResult);
                // 也可以使用字符数组流获取解析的内容
                String content = new String(baos.toByteArray());
                baos.close();
                return content;
            } else {
                logger.error("Enter only MS Office 2003 files");
                return null;
            }
        }
    }
    /**
     * 解析doc成html 并保存图片文件到本地
     *
     * @param file
     * @return
     * @throws IOException
     * @throws ParserConfigurationException
     * @throws TransformerException
     */
    public static String Word2003ToHtmlAndSaveImage(String docsTempImages, MultipartFile file)
            throws IOException, ParserConfigurationException, TransformerException {
        if (file.isEmpty() || file.getSize() <= 0) {
            logger.error("Sorry File does not Exists!");
            return null;
        } else {
            if (file.getOriginalFilename().endsWith(".doc") || file.getOriginalFilename().endsWith(".DOC")) {
                HWPFDocument wordDocument = new HWPFDocument(file.getInputStream());
                WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument());
                //设置图片存放的位置
                wordToHtmlConverter.setPicturesManager(new PicturesManager() {
                    public String savePicture(byte[] content, PictureType pictureType, String suggestedName, float widthInches, float heightInches) {
                        File imgPath = new File(docsTempImages);
                        if (!imgPath.exists()) {//图片目录不存在则创建
                            imgPath.mkdirs();
                        }
                        File file = new File(docsTempImages + suggestedName);
                        try {
                            OutputStream os = new FileOutputStream(file);
                            os.write(content);
                            os.close();
                        } catch (FileNotFoundException e) {
                            e.printStackTrace();
                        } catch (IOException e) {
                            e.printStackTrace();
                        }
                        return docsTempImages + suggestedName;
                    }
                });
                //解析word文档
                wordToHtmlConverter.processDocument(wordDocument);
                Document document = wordToHtmlConverter.getDocument();
                // 也可以使用字符数组流获取解析的内容
                ByteArrayOutputStream baos = new ByteArrayOutputStream();
                DOMSource domSource = new DOMSource(document);
                StreamResult streamResult = new StreamResult(baos);
                TransformerFactory factory = TransformerFactory.newInstance();
                Transformer serializer = factory.newTransformer();
//                serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");
                serializer.setOutputProperty(OutputKeys.ENCODING, "gb2312");
                serializer.setOutputProperty(OutputKeys.INDENT, "yes");
                serializer.setOutputProperty(OutputKeys.METHOD, "html");
                serializer.transform(domSource, streamResult);
                baos.close();
                // 也可以使用字符数组流获取解析的内容
                return new String(baos.toByteArray());
            } else {
                logger.error("Enter only MS Office 2003 files");
                return null;
            }
        }
    }
    /**
     * 获取word中的图片名称和本地url(doc或docx)
     * 返回map<图片名称, 存储的图片url地址>
     *
     * @param uploadPath     图片存放路径
     * @param docsTempImages 本地临时图片存放地址(这个工具类Word2003ToHtmlAndSaveImage的方法存到了系统临时文件夹里)
     * @param file
     * @return
     * @throws IOException
     */
    public static Map<String, String> getImageMaps(String uploadPath, String docsTempImages, MultipartFile file) throws IOException {
        //返回map
        HashMap<String, String> map = new HashMap<>();
        if (file.getOriginalFilename().endsWith(".docx") || file.getOriginalFilename().endsWith(".DOCX")) {
            //获取存在word里的图片文件
            InputStream in = file.getInputStream();
            XWPFDocument document = new XWPFDocument(in);
            List<XWPFParagraph> paragraphs = document.getParagraphs();
            if (CollUtil.isNotEmpty(paragraphs)) {
                paragraphs.forEach(p -> {
                    List<XWPFRun> runs = p.getRuns();
                    if (CollUtil.isNotEmpty(runs)) {
                        runs.forEach(r -> {
                            List<XWPFPicture> pictures = r.getEmbeddedPictures();
                            if (CollUtil.isNotEmpty(pictures)) {
                                pictures.forEach(c -> {
                                    //这里找到word中的图片的名字和数据
                                    XWPFPictureData pictureData = c.getPictureData();
                                    String fileName = pictureData.getFileName();
                                    byte[] data = pictureData.getData();
                                    //保存到本地获取url
                                    String localUrl = saveImageToLocalWithByte(fileName, data, uploadPath);
                                    map.put(pictureData.getFileName(), localUrl);
                                });
                            }
                        });
                    }
                });
            }
        } else if (file.getOriginalFilename().endsWith(".doc") || file.getOriginalFilename().endsWith(".DOC")) {
            try {
                File dir = new File(docsTempImages);
                //如果目录不为空遍历存储到项目中
                if (!FileUtil.isEmpty(dir)) {
                    Arrays.asList(FileUtil.ls(docsTempImages)).forEach(f -> {
                        String name = f.getName();
                        BufferedInputStream inputStream = FileUtil.getInputStream(f);
                        String localUrl = saveImageToLocalWithStream(name, inputStream, uploadPath);
                        map.put(name, localUrl);
                    });
                }
            } finally {
                //删除临时文件夹
                FileUtil.del(docsTempImages);
            }
        }
        return map;
    }
    /**
     * 保存图片到项目中,返回路径(byte[])
     *
     * @param name       图片名字
     * @param data       图片字节数组
     * @param uploadPath 存储路径
     * @return
     */
    private static String saveImageToLocalWithByte(String name, byte[] data, String uploadPath) {
        FileUtil.writeBytes(data, uploadPath + name);
        //自己项目的ip和端口,html图片地址要用,或者根据自己需求指定存到什么地方,自定义
        String ipAndPort = "";
        return URLUtil.normalize(ipAndPort + name);
    }
    /**
     * 保存图片到项目中,返回路径(inputStream)
     *
     * @param name        图片名字
     * @param inputStream 输入流
     * @param uploadPath  存储路径
     * @return
     */
    private static String saveImageToLocalWithStream(String name, InputStream inputStream, String uploadPath) {
        savePic(uploadPath, inputStream, name);
        //自己项目的ip和端口,html图片地址要用,或者根据自己需求指定存到什么地方,自定义
        String ipAndPort = "";
        return URLUtil.normalize(ipAndPort + name);
    }
    /**
     * 保存图片
     *
     * @param path        存储路径
     * @param inputStream 输入流
     * @param fileName    文件名称
     */
    private static void savePic(String path, InputStream inputStream, String fileName) {
        OutputStream os = null;
        try {
            // 2、保存到临时文件
            // 1K的数据缓冲
            byte[] bs = new byte[1024];
            // 读取到的数据长度
            int len;
            // 输出的文件流保存到本地文件
            File tempFile = new File(path);
            if (!tempFile.exists()) {
                tempFile.mkdirs();
            }
            os = new FileOutputStream(tempFile.getPath() + File.separator + fileName);
            // 开始读取
            while ((len = inputStream.read(bs)) != -1) {
                os.write(bs, 0, len);
            }
        } catch (IOException e) {
            e.printStackTrace();
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // 完毕,关闭所有链接
            try {
                os.close();
                inputStream.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
    static FileItem createFileItem(String filePath) {
        FileItemFactory factory = new DiskFileItemFactory(16, null);
        String textFieldName = "textField";
        int num = filePath.lastIndexOf(".");
        String extFile = filePath.substring(num);
        String path = filePath.substring(0, num);
        path = path.replace("\\", "/");
        String[] fileNames = path.split("/");
        String fileName = fileNames[fileNames.length - 1];
        FileItem item = factory.createItem(textFieldName, "text/plain", true, fileName + extFile);
        File newfile = new File(filePath);
        int bytesRead = 0;
        byte[] buffer = new byte[8192];
        try {
            FileInputStream fis = new FileInputStream(newfile);
            OutputStream os = item.getOutputStream();
            while ((bytesRead = fis.read(buffer, 0, 8192)) != -1) {
                os.write(buffer, 0, bytesRead);
            }
            os.close();
            fis.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
        return item;
    }
    public static void main(String[] args) throws Exception {
        String path1 = "/Users/name/Downloads/全款合同.doc";
        String path2 = "/Users/name/Downloads/买卖合同.docx";
        MultipartFile file1 = new CommonsMultipartFile(createFileItem(path1));
        MultipartFile file2 = new CommonsMultipartFile(createFileItem(path2));
        System.out.println(WordToHtmlUtil.Word2003ToHtml(file1));
        // 生成2007
        WordToHtmlUtil.Word2007ToHtml(file2);
    }
}
  3. 页面调整
字体:font-family:SimSun;    font-family:Times New Roman
表格宽度:.t1{width:100%;}
表格td样式:去掉固定宽度声明,例如 width:1.1131945in;
body: style="width: 72%;margin: 0 auto;line-height: 150%;"


相关文章
|
数据采集 存储 搜索推荐
用 Python 将 html 转为 pdf、word
在日常中有时需将 html 文件转换为 pdf、word 文件。网上免费的大多数不支持多个文件转换的情况,而且在转换几个后就开始收费了。
1269 0
用 Python 将 html 转为 pdf、word
|
6月前
使用LabVIEW打开默认应用程序中的文档(PDF,Word,Excel,Html)
使用LabVIEW的&quot;Open a Document on Disk.vi&quot;,存于&lt;LabVIEW&gt;\vi.lib\Platform\browser.llb,可让默认应用打开硬盘文档。此VI仅基础打开功能,高级控制推荐LabVIEW Report Generation Toolkit或ActiveX。注意:避免版本升级问题,最好将VI复制到vi.lib外的目录。
277 3
|
6月前
|
Java Maven
Java在线预览(word转html)--强势推荐
Java在线预览(word转html)--强势推荐
162 0
|
Java Apache
java word转html 报错org/apache/poi/xwpf/usermodel/IRunBody
java word转html 报错org/apache/poi/xwpf/usermodel/IRunBody
279 0
|
数据库
wangEditor富文本编辑器的调用开发实录2(V5版本自定义粘贴,去除复制word或网页html冗余样式代码的解决方案)
wangEditor富文本编辑器的调用开发实录2(V5版本自定义粘贴,去除复制word或网页html冗余样式代码的解决方案)
797 0
|
开发者
利用word制作html无法直接绘制的漂亮表格
利用word制作html无法直接绘制的漂亮表格
|
Python
Python 技术篇 - 使用pypandoc库实现html文档转word文档实例演示
Python 技术篇 - 使用pypandoc库实现html文档转word文档实例演示
483 0
Python 技术篇 - 使用pypandoc库实现html文档转word文档实例演示
|
Java 程序员 API
Java:Java的jar包之POI的简介、安装、使用方法(基于POI将Word、Excel、PPT转换为html)之详细攻略
Java:Java的jar包之POI的简介、安装、使用方法(基于POI将Word、Excel、PPT转换为html)之详细攻略
|
存储 数据可视化 Java
下一篇
DataWorks