Java:將 rtf 或 doc 轉成 html 格式


什麼是 rtf 格式?rtf 是一種富文本格式(Rich Text Format),可以包含文字、圖片等內容。rtf 可以用 word 或者 wps 直接打開,也可以用文本編輯器打開;用文本編輯器打開時會顯示其源碼。rtf 源碼格式解析可以參考文末「參考文章」中的鏈接。

用 Java 代碼解析 rtf 格式,可以使用 Apache Tika(它支持 rtf 格式),但是網上可參考的文檔較少;而 doc 轉成 html 的參考文檔較多,因此可采用如下步驟:

  1. 將 rtf 轉成 doc 格式
  2. 將 doc 格式轉換成 html

步驟 1 較為簡單,可以先用 word 或者 wps 打開 rtf 文件,然後通過「文件 → 另存為」保存成 doc 即可。如果只有一個文件,可以這樣手動操作;但如果有上百個文件,這樣操作肯定較為繁瑣,可以查看這篇文章,批量將 rtf 另存為 doc 格式。

步驟 2 可以參考網上的這篇文章, 通過 Apache POI 將 doc 轉成 html 格式,且樣式圖片不會丟失。

步驟 2 中網上那篇文章在將 doc 轉 html 時,會提取其中的圖片,然後在 html 中通過相對路徑引用這些圖片。其實還有一種辦法,是將圖片轉成 base64 編碼,直接內嵌在 html 網頁中。代碼如下:

首先導入依賴,這里采用的是 poi 3.17 版本,其他版本也可以。

    <!-- Apache POI artifacts for the .doc -> HTML converter; all three are pinned to the same version (3.17). -->
    <dependencies>
        <!-- POI core library. -->
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi</artifactId>
            <version>3.17</version>
        </dependency>
        <!-- NOTE(review): the converter code in this article imports only org.apache.poi.hwpf.*;
             confirm whether poi-ooxml is actually required before keeping it. -->
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-ooxml</artifactId>
            <version>3.17</version>
        </dependency>
        <!-- Presumably the artifact that ships the org.apache.poi.hwpf.* classes used by the
             converter; verify when switching to another POI version. -->
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-scratchpad</artifactId>
            <version>3.17</version>
        </dependency>
    </dependencies>
View Code

然后寫工具類

package conv;

import java.io.BufferedWriter;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.Base64;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;

import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.PicturesManager;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.hwpf.usermodel.PictureType;
import org.w3c.dom.Document;

public class Word2HtmlAboutPic {
    public static void main(String argv[]) {
        try {
            convert2Html("D://2.doc", "D://2.html");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    public static void writeFile(String content, String path) {
        FileOutputStream fos = null;
        BufferedWriter bw = null;
        try {
            File file = new File(path);
            fos = new FileOutputStream(file);
            bw = new BufferedWriter(new OutputStreamWriter(fos, "GB2312"));
            bw.write(content);
        } catch (FileNotFoundException fnfe) {
            fnfe.printStackTrace();
        } catch (IOException ioe) {
            ioe.printStackTrace();
        } finally {
            try {
                if (bw != null)
                    bw.close();
                if (fos != null)
                    fos.close();
            } catch (IOException ie) {
                ie.printStackTrace();
            }
        }
    }

    public static void write2File(byte[] content, String path) {
        FileOutputStream fos = null;
        BufferedWriter bw = null;
        try {
            File file = new File(path);
            fos = new FileOutputStream(file);
            fos.write(content);
            fos.close();
        } catch (FileNotFoundException fnfe) {
            fnfe.printStackTrace();
        } catch (IOException ioe) {
            ioe.printStackTrace();
        } finally {
            try {
                if (bw != null)
                    bw.close();
                if (fos != null)
                    fos.close();
            } catch (IOException ie) {
                ie.printStackTrace();
            }
        }
    }

    public static void convert2Html(String fileName, String outPutFile)
            throws TransformerException, IOException, ParserConfigurationException {

        Base64.Encoder encoder = Base64.getEncoder();

        HWPFDocument wordDocument = new HWPFDocument(new FileInputStream(fileName));

        WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(
                DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument());
        wordToHtmlConverter.setPicturesManager(new PicturesManager() {
            public String savePicture(byte[] content, PictureType pictureType, String suggestedName, float widthInches,
                    float heightInches) {
                String encodedText = new String(encoder.encode(content));
                String imgSrc = "data:" + pictureType.getMime() + ";" + "base64," + encodedText;
                return imgSrc;
            }
        });
        wordToHtmlConverter.processDocument(wordDocument);

        Document htmlDocument = wordToHtmlConverter.getDocument();
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        DOMSource domSource = new DOMSource(htmlDocument);
        StreamResult streamResult = new StreamResult(out);

        TransformerFactory tf = TransformerFactory.newInstance();
        Transformer serializer = tf.newTransformer();
        serializer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
        serializer.setOutputProperty(OutputKeys.INDENT, "yes");
        serializer.setOutputProperty(OutputKeys.METHOD, "html");
        serializer.transform(domSource, streamResult);
        out.close();
        write2File(out.toByteArray(), outPutFile);
    }
}
View Code

然后批量轉換

package conv;

import java.io.File;

/**
 * Batch driver: converts every regular file directly inside a source folder to
 * HTML via {@code Word2HtmlAboutPic.convert2Html}, writing
 * {@code <name-without-extension>.html} into a destination folder.
 */
public class BatchWord2Html {

    /**
     * Runs the batch conversion.
     *
     * @param args optional overrides: {@code args[0]} is the source folder and
     *             {@code args[1]} the destination folder; when absent, the
     *             built-in default folders are used
     * @throws Exception if the source folder cannot be listed or a conversion fails
     */
    public static void main(String[] args) throws Exception {
        String originalFolder = args.length > 0 ? args[0] : "D:/develop/temp/original";
        String destinationFolder = args.length > 1 ? args[1] : "D:/develop/temp/destination";

        // listFiles() returns null when the path is not a directory or cannot be read.
        File[] entries = new File(originalFolder).listFiles();
        if (entries == null) {
            throw new IllegalStateException("Cannot list source folder: " + originalFolder);
        }

        for (File fileEntry : entries) {
            if (fileEntry.isDirectory()) {
                // Skip subfolders but keep going: the listing order is unspecified,
                // so aborting here would drop an arbitrary subset of the files.
                System.out.println("Subfolders in the folder");
                continue;
            }
            String filename = fileEntry.getName();
            // Strip the extension only when there is one; names without a dot (or
            // starting with one) are used unchanged as the output base name.
            int dot = filename.lastIndexOf('.');
            String caselsh = dot > 0 ? filename.substring(0, dot) : filename;
            String outFileName = destinationFolder + "/" + caselsh + ".html";
            System.out.println(filename + " " + outFileName);
            Word2HtmlAboutPic.convert2Html(originalFolder + "/" + filename, outFileName);
        }
    }

}
View Code

 

 

參考文章:

rtf 格式解析 https://blog.csdn.net/dream_dt/article/details/79215798

word 轉 html https://www.cnblogs.com/jameslif/p/3356588.html


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM