package com.szy.project.utils; import java.io.BufferedInputStream; import java.io.BufferedOutputStream; import java.io.BufferedReader; import java.io.DataOutputStream; import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.OutputStreamWriter; import java.io.Reader; import java.io.Writer; import java.util.Enumeration; import javax.mail.MessagingException; import javax.mail.Multipart; import javax.mail.Session; import javax.mail.internet.MimeBodyPart; import javax.mail.internet.MimeMessage; import javax.mail.internet.MimeMultipart; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; /** * 轉換工具 ---------- 需要引入第三方依賴 javaMail轉換格式 和 jsoup解析HTML * jsoup 文檔地址 :http://www.open-open.com/jsoup/parse-document-from-string.htm * 將mht 轉化成 HTML * @author 隔壁老王 * */ public class Mht2HtmlUtil { public static void main(String[] args) throws IOException { /** * 轉換 */ //mht2html("f:\\job_111.mht", "f:\\test.htm"); /** * 獲取姓名和性別 */ String nameAndSex = Mht2HtmlUtil.findResultValue("f:\\test.htm", "li", "info_name"); String tmpString = nameAndSex.replaceAll("(?i)[^a-zA-Z0-9\u4E00-\u9FA5]", "");// 去掉所有中英文符號 char[] carr = tmpString.toCharArray(); for (int i = 0; i < tmpString.length(); i++) { if (carr[i] < 0xFF) { carr[i] = ' ';// 過濾掉非漢字內容 } } System.out.println(tmpString.substring(0, tmpString.length()-1)); //姓名 System.out.println(tmpString.substring(tmpString.length()-1)); //性別 /** * 獲取教育經歷 */ File htmlf=new File("f:\\test.htm"); Document doc=Jsoup.parse(htmlf, "UTF-8"); String ss=doc.body().toString(); //class等於masthead的li標簽 Object[] aa= doc.select("div.detaile_box").toArray(); for (int i = 0; i < aa.length; i++) { if(i==3){ String strtext = aa[i].toString(); Document docs = Jsoup.parse(strtext); Object[] bb= docs.select("b.edu_main_sch").toArray(); for (int j = 0; j < bb.length; j++) { String tt = bb[j].toString(); Document doct = Jsoup.parse(tt); String result = doct.select("b.edu_main_sch").text(); String a=result.substring(0, result.indexOf("|")).trim(); String b=result.substring(result.lastIndexOf("|")+1, result.length()).trim(); System.out.println(a+" "+b); //畢業院校加學歷 } } } } /** * 解析標簽 獲取標簽值 * @param htmlFilePath 文件路徑 * @param lableName 標簽名稱 * @param onClassName 標簽名稱 * @return * @throws IOException */ public static String findResultValue(String htmlFilePath , String lableName , String onClassName) throws IOException{ File htmlf=new File(htmlFilePath); Document doc=Jsoup.parse(htmlf, "UTF-8"); String bodyText=doc.body().toString(); // 獲取文件文本信息 //class等於onClassName的lableName標簽 String resultValue = doc.select(lableName+"."+onClassName).first().text(); return resultValue; } /** * 解析標簽結果返回多個值 * @param htmlFilePath 文件路徑 * @param lableName 標簽名稱 * @param onClassName 標簽名稱 * @return * @throws IOException */ public static Object[] findResultValueToArray (String htmlFilePath , String lableName , String onClassName) throws IOException{ File htmlf=new File(htmlFilePath); Document doc=Jsoup.parse(htmlf, "UTF-8"); String bodyText=doc.body().toString(); // 獲取文件文本信息 return doc.select(lableName+"."+onClassName).toArray(); } /** * 將 mht文件轉換成 html文件 * * @param s_SrcMht // mht 文件的位置 * @param s_DescHtml // 轉換后輸出的HTML的位置 */ public static void mht2html(String srcMht, String descHtml) { try { InputStream fis = new FileInputStream(srcMht); Session mailSession = Session.getDefaultInstance( System.getProperties(), null); MimeMessage msg = new MimeMessage(mailSession, fis); Object content = msg.getContent(); if (content instanceof Multipart) { MimeMultipart mp = (MimeMultipart) content; MimeBodyPart bp1 = (MimeBodyPart) mp.getBodyPart(0); // 獲取mht文件內容代碼的編碼 String strEncodng = getEncoding(bp1); // 獲取mht文件的內容 String strText = getHtmlText(bp1, strEncodng); if (strText == null) return; /** * 創建以mht文件名稱的文件夾,主要用來保存資源文件。 這里不需要所以注釋掉了 */ /* File parent = null; if (mp.getCount() > 1) { parent = new File(new File(descHtml).getAbsolutePath() + ".files"); parent.mkdirs(); if (!parent.exists()) { // 創建文件夾失敗的話則退出 return; } }*/ /** * FOR中代碼 主要是保存資源文件及替換路徑 這里不需要所以注釋掉了 */ /* for (int i = 1; i < mp.getCount(); ++i) { MimeBodyPart bp = (MimeBodyPart) mp.getBodyPart(i); // 獲取資源文件的路徑 // 例(獲取: http://xxx.com/abc.jpg) String strUrl = getResourcesUrl(bp); if (strUrl == null || strUrl.length() == 0) continue; DataHandler dataHandler = bp.getDataHandler(); MimePartDataSource source = (MimePartDataSource) dataHandler .getDataSource(); // 獲取資源文件的絕對路徑 String FilePath = parent.getAbsolutePath() + File.separator + getName(strUrl, i); File resources = new File(FilePath); // 保存資源文件 if (SaveResourcesFile(resources, bp.getInputStream())) { // 將遠程地址替換為本地地址 如圖片、JS、CSS樣式等等 strText = strText.replace(strUrl, resources.getAbsolutePath()); } }*/ // 最后保存HTML文件 SaveHtml(strText, descHtml, strEncodng); } } catch (Exception e) { e.printStackTrace(); } } /** * 獲取mht文件內容中資源文件的名稱 * * @param strName * @param ID * @return */ public static String getName(String strName, int ID) { char separator1 = '/'; char separator2 = '\\'; // 將換行替換 strName = strName.replaceAll("\r\n", ""); // 獲取文件名稱 if (strName.lastIndexOf(separator1) >= 0) { return strName.substring(strName.lastIndexOf(separator1) + 1); } if (strName.lastIndexOf(separator2) >= 0) { return strName.substring(strName.lastIndexOf(separator2) + 1); } return ""; } /** * 將提取出來的html內容寫入保存的路徑中。 * * @param strText * @param strHtml * @param strEncodng */ public static boolean SaveHtml(String s_HtmlTxt, String s_HtmlPath, String s_Encode) { try { Writer out = null; out = new OutputStreamWriter( new FileOutputStream(s_HtmlPath, false), s_Encode); out.write(s_HtmlTxt); out.close(); } catch (Exception e) { return false; } return true; } /** * 保存網頁中的JS、圖片、CSS樣式等資源文件 * * @param SrcFile * 源文件 * @param inputStream * 輸入流 * @return */ private static boolean SaveResourcesFile(File SrcFile, InputStream inputStream) { if (SrcFile == null || inputStream == null) { return false; } BufferedInputStream in = null; FileOutputStream fio = null; BufferedOutputStream osw = null; try { in = new BufferedInputStream(inputStream); fio = new FileOutputStream(SrcFile); osw = new BufferedOutputStream(new DataOutputStream(fio)); int index = 0; byte[] a = new byte[1024]; while ((index = in.read(a)) != -1) { osw.write(a, 0, index); } osw.flush(); return true; } catch (Exception e) { e.printStackTrace(); return false; } finally { try { if (osw != null) osw.close(); if (fio != null) fio.close(); if (in != null) in.close(); if (inputStream != null) inputStream.close(); } catch (Exception e) { e.printStackTrace(); return false; } } } /** * 獲取mht文件里資源文件的URL路徑 * * @param bp * @return */ private static String getResourcesUrl(MimeBodyPart bp) { if (bp == null) { return null; } try { Enumeration list = bp.getAllHeaders(); while (list.hasMoreElements()) { javax.mail.Header head = (javax.mail.Header) list.nextElement(); if (head.getName().compareTo("Content-Location") == 0) { return head.getValue(); } } return null; } catch (MessagingException e) { return null; } } /** * 獲取mht文件中的內容代碼 * * @param bp * @param strEncoding * 該mht文件的編碼 * @return */ private static String getHtmlText(MimeBodyPart bp, String strEncoding) { InputStream textStream = null; BufferedInputStream buff = null; BufferedReader br = null; Reader r = null; try { textStream = bp.getInputStream(); buff = new BufferedInputStream(textStream); r = new InputStreamReader(buff, strEncoding); br = new BufferedReader(r); StringBuffer strHtml = new StringBuffer(""); String strLine = null; while ((strLine = br.readLine()) != null) { System.out.println(strLine); strHtml.append(strLine + "\r\n"); } br.close(); r.close(); textStream.close(); return strHtml.toString(); } catch (Exception e) { e.printStackTrace(); } finally { try { if (br != null) br.close(); if (buff != null) buff.close(); if (textStream != null) textStream.close(); } catch (Exception e) { } } return null; } /** * 獲取mht網頁文件中內容代碼的編碼 * * @param bp * @return */ private static String getEncoding(MimeBodyPart bp) { if (bp == null) { return null; } try { Enumeration list = bp.getAllHeaders(); while (list.hasMoreElements()) { javax.mail.Header head = (javax.mail.Header) list.nextElement(); if (head.getName().equalsIgnoreCase("Content-Type")) { String strType = head.getValue(); int pos = strType.indexOf("charset="); if (pos >= 0) { String strEncoding = strType.substring(pos + 8, strType.length()); if (strEncoding.startsWith("\"") || strEncoding.startsWith("\'")) { strEncoding = strEncoding.substring(1, strEncoding.length()); } if (strEncoding.endsWith("\"") || strEncoding.endsWith("\'")) { strEncoding = strEncoding.substring(0, strEncoding.length() - 1); } if (strEncoding.toLowerCase().compareTo("gb2312") == 0) { strEncoding = "gbk"; } return strEncoding; } } } } catch (MessagingException e) { e.printStackTrace(); } return null; } /** * 刪除指定文件 * @param filePath 文件路徑 * @param fileName 文件名稱 * @param layout 文件格式 */ public static void deleteFileName(String filePath , String fileName , String layout){ File folder = new File(filePath); String fileNameOnLayout=fileName+"."+layout; File[] files = folder.listFiles(); //獲取該文件夾下的所有文件 for(File file:files){ if(file.getName().equals(fileNameOnLayout)){ file.delete(); } } } }
工具所用到的第三方依賴:
<!-- https://mvnrepository.com/artifact/javax.mail/mail --> <dependency> <groupId>javax.mail</groupId> <artifactId>mail</artifactId> <version>1.4.7</version> </dependency> <!-- https://mvnrepository.com/artifact/org.jsoup/jsoup --> <dependency> <groupId>org.jsoup</groupId> <artifactId>jsoup</artifactId> <version>1.10.1</version> </dependency>