javaword文档转html文件

Java Word文档转HTML文件

一、简介

一般Word文件的后缀有doc、docx两种。docx是Office Word 2007及以后版本文档的扩展名；doc是Office Word 2003文档保存的扩展名。这两种格式的Word文档转换成HTML需要使用不同的方法。

对于docx格式的文档使用xdocreport进行转换。依赖如下（<version>请按项目实际使用的版本补充）：

<dependency>
    <groupId>fr.opensagres.xdocreport</groupId>
    <artifactId>fr.opensagres.xdocreport.document</artifactId>
</dependency>

<dependency>
    <groupId>fr.opensagres.xdocreport</groupId>
    <artifactId>org.apache.poi.xwpf.converter.xhtml</artifactId>
</dependency>

对于doc格式的文档使用poi进行转换。依赖如下（<version>请按项目实际使用的版本补充）：

<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi</artifactId>
</dependency>

<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-scratchpad</artifactId>
</dependency>

二、示例

代码示例如下：

st.word;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;

import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;

import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.PicturesManager;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.hwpf.usermodel.PictureType;
import org.apache.poi.xwpf.converter.core.FileImageExtractor;
import org.apache.poi.xwpf.converter.core.FileURIResolver;
import org.apache.poi.xwpf.converter.xhtml.XHTMLConverter;
import org.apache.poi.xwpf.converter.xhtml.XHTMLOptions;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.junit.Test;
import org.w3c.dom.Document;

32/**

33 * word 转换成html

34*/

35public class WordToHtml {

37/**

38 * 2007版本word转换成html

39 * @throws IOException

40*/

41 @Test

42public void Word2007ToHtml() throws IOException {

43 String filepath = "C:/test/";

44 String fileName = "滕王阁序2007.docx";

45 String htmlName = "滕王阁序2007.html";

46final String file = filepath + fileName;

47 File f = new File(file);

48if (!f.exists()) {

49 System.out.println("Sorry File does not Exists!");

50 } else {

51if (f.getName().endsWith(".docx") || f.getName().endsWith(".DOCX")) {

53// 1) 加载word⽂档⽣成 XWPFDocument对象

54 InputStream in = new FileInputStream(f);

55 XWPFDocument document = new XWPFDocument(in);

57// 2) 解析 XHTML配置 (这⾥设置IURIResolver来设置图⽚存放的⽬录)

58 File imageFolderFile = new File(filepath);

59 XHTMLOptions options = ate().URIResolver(new FileURIResolver(imageFolderFile));

60 options.setExtractor(new FileImageExtractor(imageFolderFile));

61 options.setIgnoreStylesIfUnused(false);

62 options.setFragment(true);

64// 3) 将 XWPFDocument转换成XHTML

65 OutputStream out = new FileOutputStream(new File(filepath + htmlName));

66 Instance().convert(document, out, options);

68//也可以使⽤字符数组流获取解析的内容

69// ByteArrayOutputStream baos = new ByteArrayOutputStream();

70// Instance().convert(document, baos, options);

71// String content = String();

72// System.out.println(content);

73// baos.close();

74 } else {

75 System.out.println("Enter only MS Office 2007+ files");

76 }

77 }

78 }

80/**

81 * /**

82 * 2003版本word转换成html

83 * @throws IOException

84 * @throws TransformerException

85 * @throws ParserConfigurationException

86*/

87 @Test

88public void Word2003ToHtml() throws IOException, TransformerException, ParserConfigurationException {

89 String filepath = "C:/test/";

90final String imagepath = "C:/test/image/";

91 String fileName = "滕王阁序2003.doc";

92 String htmlName = "滕王阁序2003.html";

93final String file = filepath + fileName;

94 InputStream input = new FileInputStream(new File(file));

95 HWPFDocument wordDocument = new HWPFDocument(input);

96 WordToHtmlConverter wordToHtmlConverter = new wInstance().newDocumentBuilder().newDocument()); 97//设置图⽚存放的位置

98 wordToHtmlConverter.setPicturesManager(new PicturesManager() {

99public String savePicture(byte[] content, PictureType pictureType, String suggestedName, float widthInches, float heightInches) {

100 File imgPath = new File(imagepath);

101if(!ists()){//图⽚⽬录不存在则创建

102 imgPath.mkdirs();

103 }

pdf转html

104 File file = new File(imagepath + suggestedName);

105try {

106 OutputStream os = new FileOutputStream(file);

107 os.write(content);

108 os.close();

109 } catch (FileNotFoundException e) {

110 e.printStackTrace();

111 } catch (IOException e) {

112 e.printStackTrace();

113 }

114return imagepath + suggestedName;

115 }

116 });

117

118//解析word⽂档

119 wordToHtmlConverter.processDocument(wordDocument);

120 Document htmlDocument = Document();

121

122 File htmlFile = new File(filepath + htmlName);

123 OutputStream outStream = new FileOutputStream(htmlFile);

124

125//也可以使⽤字符数组流获取解析的内容

126// ByteArrayOutputStream baos = new ByteArrayOutputStream();

127// OutputStream outStream = new BufferedOutputStream(baos);

128

129 DOMSource domSource = new DOMSource(htmlDocument);

130 StreamResult streamResult = new StreamResult(outStream);

131

132 TransformerFactory factory = wInstance(); 133 Transformer serializer = wTransformer();

134 serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8"); 135 serializer.setOutputProperty(OutputKeys.INDENT, "yes");

136 serializer.setOutputProperty(OutputKeys.METHOD, "html"); 137

138 ansform(domSource, streamResult);

139

140//也可以使⽤字符数组流获取解析的内容

141// String content = String();

142// System.out.println(content);

143// baos.close();

144 outStream.close();

145 }

146 }

运行生成文件结果如下：

javaword文档转html文件

发布评论取消回复

最近发表

热门文章

标签列表