如题的需求,需要用Java将word的doc(docx)文档转成html,另外word文档中,还有一些图片,需要将图片转成base保存在html中。
需要用到的框架如下:
相关的maven依赖如下:
pom.xml
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>5.2.3</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>5.2.3</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-scratchpad</artifactId>
<version>5.2.3</version>
</dependency>
<dependency>
<groupId>fr.opensagres.xdocreport</groupId>
<artifactId>fr.opensagres.xdocreport.document</artifactId>
<version>2.0.1</version>
</dependency>
<dependency>
<groupId>fr.opensagres.xdocreport</groupId>
<artifactId>fr.opensagres.poi.xwpf.converter.xhtml</artifactId>
<version>2.0.1</version>
</dependency>
直接doc转html代码如下:
public class Test {
public static void main(String[] args) throws Exception {
//ps:当inputStream!=null,而生成wordDocument报错,请检查文档是否用office word保存的
HWPFDocument wordDocument = (HWPFDocument) WordToHtmlUtils.loadDoc(new File("/Users/Terry/Downloads/word.doc"));
WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(
DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument()
);
//将图片转成base64的格式
PicturesManager pictureRunMapper = (bytes, pictureType, s, v, v1) -> "data:image/png;base64," + Base64.encodeBase64String(bytes);
wordToHtmlConverter.setPicturesManager(pictureRunMapper);
//解析word文档
wordToHtmlConverter.processDocument(wordDocument);
Document htmlDocument = wordToHtmlConverter.getDocument();
DOMSource domSource = new DOMSource(htmlDocument);
StreamResult streamResult = new StreamResult(new FileOutputStream("/Users/Terry/Downloads/dest.html"));
TransformerFactory factory = TransformerFactory.newInstance();
Transformer serializer = factory.newTransformer();
serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");
serializer.setOutputProperty(OutputKeys.INDENT, "yes");
serializer.setOutputProperty(OutputKeys.METHOD, "html");
serializer.transform(domSource, streamResult);
}
}
文章评论