HTML 轉純文本並分段:實現富文本導入 Word 時的格式轉換與標籤過濾
一、工具類 Html2Text
import javax.swing.text.html.HTMLEditorKit;
import javax.swing.text.html.parser.ParserDelegator;
import java.io.*;
public class Html2Text extends HTMLEditorKit.ParserCallback {
private static Html2Text html2Text = new Html2Text();
StringBuffer s;
public Html2Text() {
}
public void parse(String str) throws IOException {
InputStream iin = new ByteArrayInputStream(str.getBytes());
Reader in = new InputStreamReader(iin);
s = new StringBuffer();
ParserDelegator delegator = new ParserDelegator();
// the third parameter is TRUE to ignore charset directive
delegator.parse(in, this, Boolean.TRUE);
iin.close();
in.close();
}
public void handleEndOfLineString(String eol) {
}
/**
*按標簽分割過濾后執行
*/
public void handleText(char[] text, int pos) {
s.append(text);
}
public String getText() {
return s.toString();
}
public static String getContent(String str) {
try {
html2Text.parse(str);
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
return html2Text.getText();
}
}
二、分段實現
/**
 * Converts rich-text HTML into a list of plain-text paragraphs.
 *
 * <p>{@code <div>} tags are treated as {@code <p>} tags, the markup is cut at every
 * paragraph opening tag, and each piece is reduced to text with
 * {@code Html2Text.getContent}. Pieces that yield no text are dropped.
 *
 * <p>Created 2021/10/23 14:15.
 *
 * @author gaogushenling
 * @version 1.0
 * @param xmlStr the rich-text HTML; {@code null} is treated as empty
 * @return the paragraph texts in document order; a single placeholder entry if there are none
 */
private List<String> getTextContentP(String xmlStr) {
    List<String> textList = new ArrayList<>();
    if (xmlStr != null) {
        // Rename only real <div>/</div> tags. A blanket "div" -> "p" replacement also
        // rewrote the letters "div" inside visible text and attribute values.
        String s = xmlStr.replaceAll("(?i)<(/?)div(?=[\\s>/])", "<$1p");
        // Cut only at paragraph tags (not <pre>, <param>, ...). The "<p>" prefix puts any
        // content before the first paragraph tag into a paragraph of its own, so it is
        // not glued into a malformed tag by the "<p " prefix added below.
        String[] ss = ("<p>" + s).split("(?i)<p(?=[\\s>/])");
        for (String s1 : ss) {
            if (s1.isEmpty()) {
                continue;
            }
            // split() consumed the "<p", so put it back before parsing the piece.
            String s2 = Html2Text.getContent("<p "+s1);
            if (StringUtil.isNotEmpty(s2)){
                textList.add(s2);
            }
        }
    }
    if (textList.isEmpty()) {
        textList.add("富文本文件是空的");
    }
    return textList;
}
調用
// Example call: the string literal is a placeholder for the rich-text HTML to convert.
List<String> textList = getTextContentP("富文本(html格式)");
番外:如果標籤不規範(例如沒有正確閉合),以上方式可能無法正確分割。
此時用下方的正則寫法替換 String s2 = Html2Text.getContent("<p "+s1); 這一行即可。該正則會匹配每一個 <...> 標籤並將其刪除,只保留標籤之間的文字。
// html2txt: delete every <...> tag and keep only the text between tags.
// Inside a character class '|' and a non-leading '^' are literals, so the class must
// exclude just '<' and '>'; otherwise tags containing '|' or '^' are never removed.
String s2 = ("<p " + s1).replaceAll("<[^<>]*>", "");
還有一件事:如果你想問為什麼用 <p> 來截斷——因為一個 <p> 標籤對應一個段落,按它切分就能分段。
以下是最終版本的實現:
/**
 * Converts rich-text HTML into a list of plain-text paragraphs by stripping tags with a
 * regular expression.
 *
 * <p>If the input contains tags, {@code <div>} tags are treated as {@code <p>} tags, the
 * markup is cut at every paragraph opening tag, and all tags are removed from each piece.
 * If nothing is left, or the input contains no tags at all, the input itself is returned
 * as the only entry.
 *
 * @param xmlStr the rich-text HTML or plain text to convert
 * @return the paragraph texts in document order; a single placeholder entry when
 *         {@code xmlStr} is empty
 */
private List<String> getTextContentP(String xmlStr) {
    List<String> textList = new ArrayList<>();
    // NOTE(review): this check uses StringUtils while the loop uses StringUtil — confirm both are intended.
    if (StringUtils.isNotEmpty(xmlStr)){
        // One tag is '<', anything except '<' and '>', then '>'. Compiled once and used both
        // to detect markup and to strip it. The earlier class [^>|^<] also excluded the
        // literal '|' and '^', so tags containing them were left in the output.
        Pattern tag = Pattern.compile("<[^<>]*>");
        //html2txt: only strip formatting when the input actually contains tags
        if (tag.matcher(xmlStr).find()){
            // Rename only real <div>/</div> tags. A blanket "div" -> "p" replacement also
            // rewrote the letters "div" inside visible text and attribute values.
            String s = xmlStr.replaceAll("(?i)<(/?)div(?=[\\s>/])", "<$1p");
            // Cut only at paragraph tags (not <pre>, <param>, ...). The "<p>" prefix puts any
            // content before the first paragraph tag into a paragraph of its own; without it
            // the "<p " added below leaked into the output as literal text.
            String[] ss = ("<p>" + s).split("(?i)<p(?=[\\s>/])");
            for (String s1 : ss) {
                if (StringUtil.isNotEmpty(s1)){
                    // split() consumed the "<p", so put it back before stripping the tags.
                    String s2 = tag.matcher("<p " + s1).replaceAll("");
                    if (StringUtil.isNotEmpty(s2)) {
                        textList.add(s2);
                    }
                }
            }
        }
        // No tags, or nothing left after stripping: fall back to the input as given.
        if (textList.isEmpty()) {
            textList.add(xmlStr);
        }
    }else {
        textList.add("富文本文件是空的");
    }
    return textList;
}
