Java:Word 文档转 HTML 文件
一、简介
  Word 文件的后缀一般有 doc、docx 两种:docx 是 Office Word 2007 及以后版本文档的扩展名;doc 是 Office Word 2003 及更早版本(97–2003)文档的扩展名。这两种格式的 Word 文档转换成 HTML 时需要使用不同的方法。
对于 docx 格式的文档,使用 xdocreport 进行转换。依赖如下:
<dependency>
<groupId>fr.opensagres.xdocreport</groupId>
<artifactId>fr.opensagres.xdocreport.document</artifactId>
<version>1.0.5</version>
</dependency>
<dependency>
<groupId>fr.opensagres.xdocreport</groupId>
<artifactId>org.apache.poi.xwpf.converter.xhtml</artifactId>
<version>1.0.5</version>
</dependency>
对于 doc 格式的文档,使用 POI(poi 与 poi-scratchpad 中的 HWPF)进行转换。依赖如下:
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>3.12</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-scratchpad</artifactId>
<version>3.12</version>
</dependency>
二、示例
  代码示例如下:
package com.test.word;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;

import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;

import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.PicturesManager;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.hwpf.usermodel.PictureType;
import org.apache.poi.xwpf.converter.core.FileImageExtractor;
import org.apache.poi.xwpf.converter.core.FileURIResolver;
import org.apache.poi.xwpf.converter.xhtml.XHTMLConverter;
import org.apache.poi.xwpf.converter.xhtml.XHTMLOptions;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.junit.Test;
import org.w3c.dom.Document;
31
32/**
33 * word 转换成html
34*/
35public class WordToHtml {
36
37/**
38    * 2007版本word转换成html
39    * @throws IOException
40*/
41    @Test
42public void Word2007ToHtml() throws IOException {
43        String filepath = "C:/test/";
44        String fileName = "滕王阁序2007.docx";
45        String htmlName = "滕王阁序2007.html";
46final String file = filepath + fileName;
47        File f = new File(file);
48if (!f.exists()) {
49            System.out.println("Sorry File does not Exists!");
50        } else {
51if (f.getName().endsWith(".docx") || f.getName().endsWith(".DOCX")) {
52
53// 1) 加载word⽂档⽣成 XWPFDocument对象
54                InputStream in = new FileInputStream(f);
55                XWPFDocument document = new XWPFDocument(in);
56
57// 2) 解析 XHTML配置 (这⾥设置IURIResolver来设置图⽚存放的⽬录)
58                File imageFolderFile = new File(filepath);
59                XHTMLOptions options = ate().URIResolver(new FileURIResolver(imageFolderFile));
60                options.setExtractor(new FileImageExtractor(imageFolderFile));
61                options.setIgnoreStylesIfUnused(false);
62                options.setFragment(true);
63
64// 3) 将 XWPFDocument转换成XHTML
65                OutputStream out = new FileOutputStream(new File(filepath + htmlName));
66                Instance().convert(document, out, options);
67
68//也可以使⽤字符数组获取解析的内容
69//                ByteArrayOutputStream baos = new ByteArrayOutputStream();
70//                Instance().convert(document, baos, options);
71//                String content = String();
72//                System.out.println(content);
73//                baos.close();
74            } else {
75                System.out.println("Enter only MS Office 2007+ files");
76            }
77        }
78    }
79
80/**
81    * /**
82    * 2003版本word转换成html
83    * @throws IOException
84    * @throws TransformerException
85    * @throws ParserConfigurationException
86*/
87    @Test
88public void Word2003ToHtml() throws IOException, TransformerException, ParserConfigurationException {
89        String filepath = "C:/test/";
90final String imagepath = "C:/test/image/";
91        String fileName = "滕王阁序2003.doc";
92        String htmlName = "滕王阁序2003.html";
93final String file = filepath + fileName;
94        InputStream input = new FileInputStream(new File(file));
95        HWPFDocument wordDocument = new HWPFDocument(input);
96        WordToHtmlConverter wordToHtmlConverter = new wInstance().newDocumentBuilder().newDocument()); 97//设置图⽚存放的位置
98        wordToHtmlConverter.setPicturesManager(new PicturesManager() {
99public String savePicture(byte[] content, PictureType pictureType, String suggestedName, float widthInches, float heightInches) {
100                File imgPath = new File(imagepath);
101if(!ists()){//图⽚⽬录不存在则创建
102                    imgPath.mkdirs();
103                }
pdf转html
104                File file = new File(imagepath + suggestedName);
105try {
106                    OutputStream os = new FileOutputStream(file);
107                    os.write(content);
108                    os.close();
109                } catch (FileNotFoundException e) {
110                    e.printStackTrace();
111                } catch (IOException e) {
112                    e.printStackTrace();
113                }
114return imagepath + suggestedName;
115            }
116        });
117
118//解析word⽂档
119        wordToHtmlConverter.processDocument(wordDocument);
120        Document htmlDocument = Document();
121
122        File htmlFile = new File(filepath + htmlName);
123        OutputStream outStream = new FileOutputStream(htmlFile);
124
125//也可以使⽤字符数组流获取解析的内容
126//        ByteArrayOutputStream baos = new ByteArrayOutputStream();
127//        OutputStream outStream = new BufferedOutputStream(baos);
128
129        DOMSource domSource = new DOMSource(htmlDocument);
130        StreamResult streamResult = new StreamResult(outStream);
131
132        TransformerFactory factory = wInstance(); 133        Transformer serializer = wTransformer();
134        serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8"); 135        serializer.setOutputProperty(OutputKeys.INDENT, "yes");
136        serializer.setOutputProperty(OutputKeys.METHOD, "html"); 137
138        ansform(domSource, streamResult);
139
140//也可以使⽤字符数组流获取解析的内容
141//        String content = String();
142//        System.out.println(content);
143//        baos.close();
144        outStream.close();
145    }
146 }
  运行后生成的文件结果如下: