Java 将 Word 文档转换为 HTML 文件
一、简介
一般 Word 文件的后缀有 doc、docx 两种。docx 是 Office Word 2007 及以后版本文档的扩展名;doc 是 Office Word 2003 文档保存的扩展名。这两种格式的 Word 文档转换成 HTML 需要使用不同的方法。
对于 docx 格式的文档,使用 xdocreport 进行转换。依赖如下:
<dependency>
<groupId>fr.opensagres.xdocreport</groupId>
<artifactId>fr.opensagres.xdocreport.document</artifactId>
<version>1.0.5</version>
</dependency>
<dependency>
<groupId>fr.opensagres.xdocreport</groupId>
<artifactId>org.apache.poi.xwpf.converter.xhtml</artifactId>
<version>1.0.5</version>
</dependency>
对于 doc 格式的文档,使用 poi 进行转换。依赖如下:
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>3.12</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-scratchpad</artifactId>
<version>3.12</version>
</dependency>
二、示例
代码示例如下:
// NOTE(review): the package line was truncated in the source ("...st.word;");
// "com.test.word" is a reconstruction — confirm against the project layout.
package com.test.word;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;

import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;

import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.PicturesManager;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.hwpf.usermodel.PictureType;
import org.apache.poi.xwpf.converter.core.FileImageExtractor;
import org.apache.poi.xwpf.converter.core.FileURIResolver;
import org.apache.poi.xwpf.converter.xhtml.XHTMLConverter;
import org.apache.poi.xwpf.converter.xhtml.XHTMLOptions;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.junit.Test;
import org.w3c.dom.Document;

32/**
33 * word 转换成html
34*/
35public class WordToHtml {
36
37/**
38 * 2007版本word转换成html
39 * @throws IOException
40*/
41 @Test
42public void Word2007ToHtml() throws IOException {
43 String filepath = "C:/test/";
44 String fileName = "滕王阁序2007.docx";
45 String htmlName = "滕王阁序2007.html";
46final String file = filepath + fileName;
47 File f = new File(file);
48if (!f.exists()) {
49 System.out.println("Sorry File does not Exists!");
50 } else {
51if (f.getName().endsWith(".docx") || f.getName().endsWith(".DOCX")) {
52
53// 1) 加载word⽂档⽣成 XWPFDocument对象
54 InputStream in = new FileInputStream(f);
55 XWPFDocument document = new XWPFDocument(in);
56
58 File imageFolderFile = new File(filepath);
59 XHTMLOptions options = ate().URIResolver(new FileURIResolver(imageFolderFile));
60 options.setExtractor(new FileImageExtractor(imageFolderFile));
61 options.setIgnoreStylesIfUnused(false);
62 options.setFragment(true);
63
64// 3) 将 XWPFDocument转换成XHTML
65 OutputStream out = new FileOutputStream(new File(filepath + htmlName));
66 Instance().convert(document, out, options);
67
69// ByteArrayOutputStream baos = new ByteArrayOutputStream();
70// Instance().convert(document, baos, options);
71// String content = String();
72// System.out.println(content);
73// baos.close();
74 } else {
75 System.out.println("Enter only MS Office 2007+ files");
76 }
77 }
78 }
79
80/**
81 * /**
82 * 2003版本word转换成html
83 * @throws IOException
84 * @throws TransformerException
85 * @throws ParserConfigurationException
86*/
87 @Test
88public void Word2003ToHtml() throws IOException, TransformerException, ParserConfigurationException {
89 String filepath = "C:/test/";
90final String imagepath = "C:/test/image/";
91 String fileName = "滕王阁序2003.doc";
92 String htmlName = "滕王阁序2003.html";
93final String file = filepath + fileName;
94 InputStream input = new FileInputStream(new File(file));
95 HWPFDocument wordDocument = new HWPFDocument(input);
96 WordToHtmlConverter wordToHtmlConverter = new wInstance().newDocumentBuilder().newDocument()); 97//设置图⽚存放的位置
98 wordToHtmlConverter.setPicturesManager(new PicturesManager() {
99public String savePicture(byte[] content, PictureType pictureType, String suggestedName, float widthInches, float heightInches) {
100 File imgPath = new File(imagepath);
101if(!ists()){//图⽚⽬录不存在则创建
102 imgPath.mkdirs();
103 }
pdf转html104 File file = new File(imagepath + suggestedName);
105try {
106 OutputStream os = new FileOutputStream(file);
107 os.write(content);
108 os.close();
109 } catch (FileNotFoundException e) {
110 e.printStackTrace();
111 } catch (IOException e) {
112 e.printStackTrace();
113 }
114return imagepath + suggestedName;
115 }
116 });
117
118//解析word⽂档
119 wordToHtmlConverter.processDocument(wordDocument);
120 Document htmlDocument = Document();
121
122 File htmlFile = new File(filepath + htmlName);
123 OutputStream outStream = new FileOutputStream(htmlFile);
124
125//也可以使⽤字符数组流获取解析的内容
126// ByteArrayOutputStream baos = new ByteArrayOutputStream();
127// OutputStream outStream = new BufferedOutputStream(baos);
128
129 DOMSource domSource = new DOMSource(htmlDocument);
130 StreamResult streamResult = new StreamResult(outStream);
131
132 TransformerFactory factory = wInstance(); 133 Transformer serializer = wTransformer();
134 serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8"); 135 serializer.setOutputProperty(OutputKeys.INDENT, "yes");
136 serializer.setOutputProperty(OutputKeys.METHOD, "html"); 137
138 ansform(domSource, streamResult);
139
140//也可以使⽤字符数组流获取解析的内容
141// String content = String();
142// System.out.println(content);
143// baos.close();
144 outStream.close();
145 }
146 }
运行后生成的文件结果如下:
发布评论