word轉html 和pdf


今天有個新的需求,就是要把word進行預覽,為了實現打印,需要轉成pdf或html,在網上找了一些方法,這里做個記錄

 

首先是轉html,看起來挺簡單的
 
首先需要引入下面幾個maven依賴(xdocreport的兩個轉換包,以及POI的兩個包)
 
<!-- java word文檔 轉 html文件 -->
<dependency>
<groupId>fr.opensagres.xdocreport</groupId>
<artifactId>fr.opensagres.xdocreport.document</artifactId>
<version>1.0.5</version>
</dependency>
<dependency>
<groupId>fr.opensagres.xdocreport</groupId>
<artifactId>org.apache.poi.xwpf.converter.xhtml</artifactId>
<version>1.0.5</version>
</dependency>
 
 
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>3.12</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-scratchpad</artifactId>
<version>3.12</version>
</dependency>
 
然后就是轉換demo
package b2b.cn.util;
 
 
 
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
 
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
 
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.PicturesManager;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.hwpf.usermodel.PictureType;
import org.apache.poi.xwpf.converter.core.FileImageExtractor;
import org.apache.poi.xwpf.converter.core.FileURIResolver;
import org.apache.poi.xwpf.converter.xhtml.XHTMLConverter;
import org.apache.poi.xwpf.converter.xhtml.XHTMLOptions;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.junit.Test;
import org.w3c.dom.Document;
 
/**
* word 轉換成html
*/
public class GoHTML {
 
public static void main(String[] args) {
try {
// new GoHTML().Word2003ToHtml(); //doc
 
new GoHTML().Word2007ToHtml();
 
 
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
 
/**
* 2007版本word轉換成html
* @throws IOException
*/
@Test
public void Word2007ToHtml() throws IOException {
String filepath = "E:/test/";
String fileName = "demo.docx";
String htmlName = "123.html";
final String file = filepath + fileName;
File f = new File(file);
if (!f.exists()) {
System.out.println("Sorry File does not Exists!");
} else {
if (f.getName().endsWith(".docx") || f.getName().endsWith(".DOCX")) {
 
// 1) 加載word文檔生成 XWPFDocument對象
InputStream in = new FileInputStream(f);
XWPFDocument document = new XWPFDocument(in);
 
// 2) 解析 XHTML配置 (這里設置IURIResolver來設置圖片存放的目錄)
File imageFolderFile = new File(filepath);
XHTMLOptions options = XHTMLOptions.create().URIResolver(new FileURIResolver(imageFolderFile));
options.setExtractor(new FileImageExtractor(imageFolderFile));
options.setIgnoreStylesIfUnused(false);
options.setFragment(true);
 
// 3) 將 XWPFDocument轉換成XHTML
OutputStream out = new FileOutputStream(new File(filepath + htmlName));
XHTMLConverter.getInstance().convert(document, out, options);
 
//也可以使用字符數組流獲取解析的內容
// ByteArrayOutputStream baos = new ByteArrayOutputStream();
// XHTMLConverter.getInstance().convert(document, baos, options);
// String content = baos.toString();
// System.out.println(content);
// baos.close();
} else {
System.out.println("Enter only MS Office 2007+ files");
}
}
}
 
/**
* /**
* 2003版本word轉換成html
* @throws IOException
* @throws TransformerException
* @throws ParserConfigurationException
*/
@Test
public void Word2003ToHtml() throws IOException, TransformerException, ParserConfigurationException {
final String imagepath = "F:/test/image/";//解析時候如果doc文件中有圖片 圖片會保存在此路徑
String filepath = "F:/test/";
String fileName = "demo.doc";
String htmlName = "123.html";
final String file = filepath + fileName;
InputStream input = new FileInputStream(new File(file));
HWPFDocument wordDocument = new HWPFDocument(input);
WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument());
//設置圖片存放的位置
wordToHtmlConverter.setPicturesManager(new PicturesManager() {
public String savePicture(byte[] content, PictureType pictureType, String suggestedName, float widthInches, float heightInches) {
File imgPath = new File(imagepath);
if(!imgPath.exists()){//圖片目錄不存在則創建
imgPath.mkdirs();
}
File file = new File(imagepath + suggestedName);
try {
OutputStream os = new FileOutputStream(file);
os.write(content);
os.close();
} catch (FileNotFoundException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
return imagepath + suggestedName;
}
});
 
//解析word文檔
wordToHtmlConverter.processDocument(wordDocument);
Document htmlDocument = wordToHtmlConverter.getDocument();
 
File htmlFile = new File(filepath + htmlName);
OutputStream outStream = new FileOutputStream(htmlFile);
 
//也可以使用字符數組流獲取解析的內容
// ByteArrayOutputStream baos = new ByteArrayOutputStream();
// OutputStream outStream = new BufferedOutputStream(baos);
 
DOMSource domSource = new DOMSource(htmlDocument);
StreamResult streamResult = new StreamResult(outStream);
 
TransformerFactory factory = TransformerFactory.newInstance();
Transformer serializer = factory.newTransformer();
serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");
serializer.setOutputProperty(OutputKeys.INDENT, "yes");
serializer.setOutputProperty(OutputKeys.METHOD, "html");
 
serializer.transform(domSource, streamResult);
 
//也可以使用字符數組流獲取解析的內容
// String content = baos.toString();
// System.out.println(content);
// baos.close();
outStream.close();
}
 
}
我只測試了docx ,沒有問題
 
 
但是轉pdf出現了一點小麻煩
 
這個方法是網上很多人都在用的
<!-- 轉pdf -->
<!-- https://mvnrepository.com/artifact/com.aspose/aspose-words -->
<dependency>
<groupId>com.aspose.words</groupId>
<artifactId>aspose-words-jdk16</artifactId>
<version>15.8.0</version>
</dependency>
 
package b2b.cn.util;
 
import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
 
import org.junit.Test;
 
import com.aspose.words.Document;
import com.aspose.words.License;
import com.aspose.words.SaveFormat;
/***
* 轉成pdf工具類
* 如果注釋掉驗證 轉化成功但是有水印
* @author Ic055
*
*/
public class GoPDF {
 
public static void main(String[] args) {
doc2pdf("E:/test/demo.docx");
}
public static boolean getLicense() {
boolean result = false;
try {
InputStream is = Test.class.getClassLoader().getResourceAsStream("license.xml"); // license.xml應放在..\WebRoot\WEB-INF\classes路徑下
License aposeLic = new License();
aposeLic.setLicense(is);
result = true;
} catch (Exception e) {
e.printStackTrace();
}
return result;
}
public static void doc2pdf(String Address) {
 
if (!getLicense()) {
// 驗證License 若不驗證則轉化出的pdf文檔會有水印產生 return; }
return;
 
}
try {
long old = System.currentTimeMillis();
File file = new File("E:/demo11.pdf"); //新建一個空白pdf文檔
FileOutputStream os = new FileOutputStream(file);
Document doc = new Document(Address); //Address是將要被轉化的word文檔
doc.save(os, SaveFormat.PDF);//全面支持DOC, DOCX, OOXML, RTF HTML, OpenDocument, PDF, EPUB, XPS, SWF 相互轉換
long now = System.currentTimeMillis();
System.out.println("共耗時:" + ((now - old) / 1000.0) + "秒"); //轉化用時
} catch (Exception e) {
e.printStackTrace();
}
 
}
 
 
 
}
 
 
 
license.xml
 
<License>
<Data>
<Products>
<Product>Aspose.Total for Java</Product>
<Product>Aspose.Words for Java</Product>
</Products>
<EditionType>Enterprise</EditionType>
<SubscriptionExpiry>20991231</SubscriptionExpiry>
<LicenseExpiry>20991231</LicenseExpiry>
<SerialNumber>8bfe198c-7f0c-4ef8-8ff0-acc3237bf0d7</SerialNumber>
</Data>
<Signature>sNLLKGMUdF0r8O1kKilWAGdgfs2BvJb/2Xp8p5iuDVfZXmhppo+d0Ran1P9TKdjV4ABwAgKXxJ3jcQTqE/2IRfqwnPf8itN8aFZlV3TJPYeD3yWE7IT55Gz6EijUpC7aKeoohTb4w2fpox58wWoF3SNp6sK6jDfiAUGEHYJ9pjU=</Signature>
</License>
 
在網上找到的絕大多數教程中,都說這個文件應該放在WebRoot/WEB-INF/classes目錄下
但是我發現一直找不到這個文件,后來偶然看到,對於maven項目來說,應該放在src/main/resources目錄下
 
還有一個小問題 就是

<dependency>
<groupId>com.aspose.words</groupId>
<artifactId>aspose-words-jdk16</artifactId>
<version>15.8.0</version>
</dependency>

這個依賴可能會報錯(maven下載不到對應的jar),所以需要先把jar下載到本地,再手動安裝到本地maven倉庫

 

 

aspose-words-15.8.0-jdk16 我這個是在網上找到的資源 然后用eclipse添加到maven倉庫就可以用啦

我放在百度網盤分享給大家

鏈接:https://pan.baidu.com/s/1DncAhgqUqfELv193WtTcDQ
提取碼:q41z

除了用eclipse添加到maven倉庫,我見到還有一種處理方式

 

  • 內置屬性:主要有兩個常用內置屬性——${basedir}表示項目根目錄,即包含pom.xml文件的目錄;${version}表示項目版本。
 

 

 

 

 

 
 
 
 
 
 
路徑
 
 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM