前言
不知不覺已經寫了104篇隨筆了,為了避免發生意外造成博客丟失,我們寫一個備份腳本對博客進行備份
1、備份格式我們選擇md文檔格式
2、圖片要下載到本地,方便我們統一上傳圖床
3、博客數據入庫,可以用mysql
我們選用Java爬蟲神器:HtmlUnit(詳情請戳:htmlUnit加持,網絡小蜘蛛的超級進化)進行文章的採集與html結構分析
博客結構分析
我們的文章結構還是挺簡單的,主要就以下幾個內容
0、文章標題,id=cb_post_title_url的a標簽
1、文章內容,id=cnblogs_post_body的div標簽
1】、二、三級標題,h2、h3標簽
2】、代碼,div標簽
3】、表格,table標簽
4】、其他,其他內容被p標簽包成一行,按p標簽內容可分成以下幾種:
1)、圖片,p標簽內容里有img標簽
2)、鏈接,p標簽內容里有a標簽
3)、標紅文字,p標簽內容里有span標簽
4)、普通文字
對應md文檔格式

代碼編寫
首先需要引入pom依賴、以及建表
<!-- htmlunit --> <dependency> <groupId>net.sourceforge.htmlunit</groupId> <artifactId>htmlunit</artifactId> <version>2.53.0</version> </dependency> <!-- mysql 驅動 --> <dependency> <groupId>mysql</groupId> <artifactId>mysql-connector-java</artifactId> <version>5.1.44</version> </dependency> <!-- hutool-all --> <dependency> <groupId>cn.hutool</groupId> <artifactId>hutool-all</artifactId> <version>5.7.4</version> </dependency>
-- Backup table for cnblogs posts: one row per backed-up post.
CREATE TABLE `cnblogs` (
  -- Primary key of the table
  `id` int(4) NOT NULL COMMENT '表主鍵',
  -- Post title
  `title` varchar(100) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT '標題',
  -- Post content in Markdown format; this column uses utf8mb4 and mediumtext
  `content` mediumtext CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NULL COMMENT '內容(md文檔格式)',
  -- Publication time
  `date` datetime NULL DEFAULT NULL COMMENT '發布時間',
  -- Number of views
  `view_count` int(7) NULL DEFAULT NULL COMMENT '閱讀數',
  -- Number of comments
  `comment_count` int(3) NULL DEFAULT NULL COMMENT '評論數',
  PRIMARY KEY (`id`) USING BTREE
) ENGINE = InnoDB CHARACTER SET = utf8 COLLATE = utf8_general_ci COMMENT = '博客園博客備份表' ROW_FORMAT = Compact;
根據博客主頁,抓取全部博客地址
/** * 獲取所有博客地址 */ private static ArrayList<String> getUrls(WebClient webClient, String url,int pageNumber) throws IOException, InterruptedException { ArrayList<String> arrayList = new ArrayList<>(10); //發起請求 HtmlPage page = webClient.getPage(url + pageNumber); //獲取URL for (DomNode domNode : page.querySelectorAll("div.postTitle")) { arrayList.add(domNode.querySelector("a.postTitle2").getAttributes().getNamedItem("href").getTextContent()); } //下一頁 if(page.querySelector("div.topicListFooter").asNormalizedText().contains("下一頁")){ //隨機休眠 Thread.sleep(RandomUtil.randomInt(1000, 2000)); ArrayList<String> urls = getUrls(webClient,url,pageNumber+1); arrayList.addAll(urls); } return arrayList; }
調用
//獲取所有博客鏈接 ArrayList<String> arrayList = getUrls(webClient, url + "/default.html?page=", 1);
根據URL獲取博客內容,解析html轉成md文檔格式
/** * 根據URL獲取博客內容,解析html轉成md文檔格式 * 1、下載保存圖片 * 2、保存md文檔 * 3、返回博客信息,如果需要可以存庫 */ private static Map<Object, Object> task(WebClient webClient,String url) throws IOException, InterruptedException { //發起請求 HtmlPage page = webClient.getPage(url); //發布時間 DomNode postDate = page.querySelector("span#post-date"); String date = postDate.asNormalizedText(); //閱讀數 DomNode postViewCount = page.querySelector("span#post_view_count"); String viewCount = postViewCount.asNormalizedText(); //評論數 DomNode postCommentCount = page.querySelector("span#post_comment_count"); String commentCount = postCommentCount.asNormalizedText(); //標題 DomNode postTitle = page.querySelector("a#cb_post_title_url"); String titleName = postTitle.asNormalizedText(); //內容 StringBuilder stringBuilder = new StringBuilder(); DomNodeList<DomNode> childNodes = page.querySelector("div#cnblogs_post_body").getChildNodes(); DomNode[] array = new DomNode[childNodes.size()]; array = childNodes.toArray(array); List<DomNode> psParamList = new ArrayList<>(Arrays.asList(array)); for (int i = 0; i < psParamList.size(); i++) { DomNode childNode = psParamList.get(i); //<div class="para"> Node aClass = childNode.getAttributes().getNamedItem("class"); if("div".equals(childNode.getNodeName()) && aClass != null && "para".equals(aClass.getTextContent())){ psParamList.addAll(i,childNode.getChildNodes()); psParamList.remove(childNode); i--; continue; } //h2,二級標題 if("h2".equals(childNode.getNodeName())){ String text = childNode.asNormalizedText(); if(!"".equals(text.trim().replaceAll(" ",""))){ stringBuilder.append("## ").append(text).append(" <br/>\n"); } continue; } //h3,三級標題 if("h3".equals(childNode.getNodeName())){ String text = childNode.asNormalizedText(); if(!"".equals(text.trim().replaceAll(" ",""))){ stringBuilder.append("### ").append(text).append(" <br/>\n"); } continue; } //div,代碼內容 if("div".equals(childNode.getNodeName()) && aClass != null && "cnblogs_code".equals(aClass.getTextContent())){ 
stringBuilder.append("```").append("\n"); stringBuilder.append(childNode.asNormalizedText()).append("\n"); stringBuilder.append("```").append("\n"); continue; } //table,表格內容 if("table".equals(childNode.getNodeName())){ DomNodeList<DomNode> trDomNodes = childNode.querySelectorAll("tr"); stringBuilder.append("\n"); for (int j = 0; j < trDomNodes.size(); j++) { DomNode trDomNode = trDomNodes.get(j); DomNodeList<DomNode> tdDomNodes = trDomNode.querySelectorAll("td"); for (DomNode tdDomNode : tdDomNodes) { stringBuilder.append("|").append(tdDomNode.asNormalizedText()); } stringBuilder.append("|\n"); //標題 if(j == 0){ for (int k = 0; k < tdDomNodes.size(); k++) { stringBuilder.append("|:----:"); } stringBuilder.append("|\n"); } } stringBuilder.append(" <br/>\n"); continue; } //文本內容 //圖片 if(childNode.asXml().contains("<img")){ DomNodeList<DomNode> imgDomNodes = childNode.querySelectorAll("img"); DomNode[] array1 = new DomNode[imgDomNodes.size()]; array1 = imgDomNodes.toArray(array1); List<DomNode> psParamList1 = new ArrayList<>(Arrays.asList(array1)); if(psParamList1.size() <= 0){ psParamList1.add(childNode); } for (DomNode imgDomNode : psParamList1) { Node srcItem = imgDomNode.getAttributes().getNamedItem("src"); if(srcItem == null){ break; } //得到圖片網絡地址 String src = srcItem.getTextContent(); //將文件下載后保存 String[] split = src.split("/1353055/"); //圖片保存路徑,隨機休眠1-2秒,重要:先下載到本地,再上傳到圖床 if(isDownloadImg){ File file = new File("F:/cnblogs/blog-image/" + split[1]); if(!file.exists()){ Thread.sleep(RandomUtil.randomInt(1000, 2000)); HttpUtil.downloadFile(src, file); } //寫入新路徑 stringBuilder.append(".append(imgPath).append(split[1]).append(")").append(" "); }else{ //寫入src路徑 stringBuilder.append(".append(src).append(")").append(" "); } } stringBuilder.append(" <br/>\n"); } //標注字體顏色 else if(childNode.getLastChild() != null && "span".equals(childNode.getLastChild().getNodeName())){ DomNode span = childNode.getLastChild(); stringBuilder.append(" 
").append(span.asXml().replaceAll("\r","").replaceAll("\n","")).append(" <br/>\n"); } //包含a標簽 else if(childNode.asXml().contains("</a>")){ String newPText = childNode.asNormalizedText(); DomNodeList<DomNode> aDomNodes = childNode.querySelectorAll("a"); for (DomNode aDomNode : aDomNodes) { String text = aDomNode.asNormalizedText(); String href = aDomNode.getAttributes().getNamedItem("href").getTextContent(); String newStr = "["+text+"]("+href+")"; newPText = newPText.replace(text,newStr); } //替換 stringBuilder.append(newPText).append(" <br/>\n"); } //普通文字 else{ String pText = childNode.asNormalizedText(); if(StrUtil.isBlankIfStr(pText)){ stringBuilder.append("\n"); }else{ //四個空格轉換 stringBuilder.append(pText.replaceFirst(" "," ")).append(" <br/>\n"); } } } //生成md文檔(文件名不能包含特殊字符:\,/,:,*,?,",<,>,|) String titleNameFileName = titleName .replaceAll("\\\\","_") .replaceAll("/","_") .replaceAll(":","_") .replaceAll("\\*","_") .replaceAll("\\?","_") .replaceAll("\"","_") .replaceAll("<","_") .replaceAll(">","_") .replaceAll("\\|","_") ; FileUtil.fileWriter(FileUtil.createFile("F:\\cnblogs\\《"+ titleNameFileName +"》.md"),stringBuilder); System.out.println("《"+titleName+"》備份完成!"); return MapUtil.builder() .put("title", titleName) .put("content", stringBuilder.toString()) .put("date", date) .put("view_count", viewCount) .put("comment_count", commentCount) .build(); }
然后再循環遍歷所有博客鏈接
for (int i = 0; i < arrayList.size(); i++) { Map<Object, Object> paramMap =task(webClient,arrayList.get(i)); //新增入庫 if(isPutDataBase){ paramMap.put("id",i+1); db.execute("insert into cnblogs values (:id, :title, :content, :date, :view_count, :comment_count)",paramMap); } //隨機休眠5-10秒 Thread.sleep(RandomUtil.randomInt(5000, 10000)); }
下載圖片用的是hutool庫的HttpUtil.downloadFile方法
//得到圖片網絡地址 String src = srcItem.getTextContent(); //將文件下載后保存 String[] split = src.split("/1353055/"); //圖片保存路徑,隨機休眠1-2秒,重要:先下載到本地,再上傳到圖床 if(isDownloadImg){ File file = new File("F:/cnblogs/blog-image/" + split[1]); if(!file.exists()){ Thread.sleep(RandomUtil.randomInt(1000, 2000)); HttpUtil.downloadFile(src, file); } //寫入新路徑 stringBuilder.append(".append(imgPath).append(split[1]).append(")").append(" "); }else{ //寫入src路徑 stringBuilder.append(".append(src).append(")").append(" "); }
數據入庫,用的是hutool庫封裝的jdbc方法
public static void main(String[] args) { //hutool工具類,使用jdbc進行查詢 try { // Oracle //SimpleDataSource ds = new SimpleDataSource("jdbc:oracle:thin:@localhost:1521:orcl", "test", "test"); // MySQL SimpleDataSource ds = new SimpleDataSource("jdbc:mysql://localhost/jfinal_demo", "root", "123456"); //使用簡單 Db use = Db.use(ds); //查詢 List<Entity> result = use.query("select * from user where name like ?","%張三%"); for (Entity entity : result) { System.out.println(JSONUtil.toJsonStr(entity)); } //更新 Map<Object, Object> paramMap = MapUtil.builder() .put("name", "張三") .put("id", 3) .build(); use.execute("update user set name = :name where id = :id", paramMap); //關閉連接 use.closeConnection(use.getConnection()); } catch (SQLException e) { e.printStackTrace(); } //hutool工具類,使用jdbc進行查詢 try { // Oracle //SimpleDataSource ds = new SimpleDataSource("jdbc:oracle:thin:@localhost:1521:orcl", "test", "test"); // MySQL SimpleDataSource ds = new SimpleDataSource("jdbc:mysql://localhost/jfinal_demo", "root", "123456"); //帶事務 Session session = Session.create(ds); List<Entity> result1 = session.query("select * from user where name like ?", "%張三%"); for (Entity entity : result1) { System.out.println(JSONUtil.toJsonStr(entity)); } for (int i = 0; i < 10; i++) { session.execute("update user set name = ? where id = ?","name"+i,i); } session.commit(); //關閉連接 session.closeConnection(session.getConnection()); } catch (SQLException e) { e.printStackTrace(); } }
效果演示
直接運行main入口函數






後記
pom引入依賴
<!-- htmlunit -->
<dependency>
<groupId>net.sourceforge.htmlunit</groupId>
<artifactId>htmlunit</artifactId>
<version>2.53.0</version>
</dependency>
<!-- mysql 驅動 -->
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>5.1.44</version>
</dependency>
<!-- hutool-all -->
<dependency>
<groupId>cn.hutool</groupId>
<artifactId>hutool-all</artifactId>
<version>5.7.4</version>
</dependency>
建表sql語句
-- Backup table for cnblogs posts: one row per backed-up post.
CREATE TABLE `cnblogs` (
  -- Primary key of the table
  `id` int(4) NOT NULL COMMENT '表主鍵',
  -- Post title
  `title` varchar(100) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT '標題',
  -- Post content in Markdown format; this column uses utf8mb4 and mediumtext
  `content` mediumtext CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NULL COMMENT '內容(md文檔格式)',
  -- Publication time
  `date` datetime NULL DEFAULT NULL COMMENT '發布時間',
  -- Number of views
  `view_count` int(7) NULL DEFAULT NULL COMMENT '閱讀數',
  -- Number of comments
  `comment_count` int(3) NULL DEFAULT NULL COMMENT '評論數',
  PRIMARY KEY (`id`) USING BTREE
) ENGINE = InnoDB CHARACTER SET = utf8 COLLATE = utf8_general_ci COMMENT = '博客園博客備份表' ROW_FORMAT = Compact;
完整Java腳本代碼
import cn.hutool.core.date.DateUtil; import cn.hutool.core.date.TimeInterval; import cn.hutool.core.map.MapUtil; import cn.hutool.core.util.RandomUtil; import cn.hutool.core.util.StrUtil; import cn.hutool.db.Db; import cn.hutool.db.ds.simple.SimpleDataSource; import cn.hutool.http.HttpUtil; import com.gargoylesoftware.htmlunit.BrowserVersion; import com.gargoylesoftware.htmlunit.WebClient; import com.gargoylesoftware.htmlunit.html.DomNode; import com.gargoylesoftware.htmlunit.html.DomNodeList; import com.gargoylesoftware.htmlunit.html.HtmlPage; import org.w3c.dom.Node; import java.io.File; import java.io.FileWriter; import java.io.IOException; import java.io.PrintWriter; import java.util.ArrayList; import java.util.Arrays; import java.util.List; import java.util.Map; /** * 博客園隨筆備份Java腳本 * * 需要提前設置博客主頁地址(用於獲取全部博客地址)、圖床路徑(生成的md文檔中,圖片的路徑將會替換成我們的圖床路徑) * 匹配規則不一樣全部適用,需要對task方法進行針對性調整 */ /* 需要引入依賴 <!-- htmlunit 2.53.0 --> <dependency> <groupId>net.sourceforge.htmlunit</groupId> <artifactId>htmlunit</artifactId> <version>2.53.0</version> </dependency> <!-- mysql 驅動 --> <dependency> <groupId>mysql</groupId> <artifactId>mysql-connector-java</artifactId> <version>5.1.44</version> </dependency> <!-- oracle 驅動 --> <!--<dependency> <groupId>com.oracle</groupId> <artifactId>ojdbc6</artifactId> <version>11.2.0.3</version> </dependency>--> <!-- hutool-all --> <dependency> <groupId>cn.hutool</groupId> <artifactId>hutool-all</artifactId> <version>5.7.4</version> </dependency> 建表SQL語句 CREATE TABLE `cnblogs` ( `id` int(4) NOT NULL COMMENT '表主鍵', `title` varchar(100) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT '標題', `content` mediumtext CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NULL COMMENT '內容(md文檔格式)', `date` datetime NULL DEFAULT NULL COMMENT '發布時間', `view_count` int(7) NULL DEFAULT NULL COMMENT '閱讀數', `comment_count` int(3) NULL DEFAULT NULL COMMENT '評論數', PRIMARY KEY (`id`) USING BTREE ) ENGINE = InnoDB CHARACTER SET = utf8 COLLATE = utf8_general_ci 
COMMENT = '博客園博客備份表' ROW_FORMAT = Compact; */ public class cnblogs { /** * 可以先測試一篇博客 * urlByTest不為空則僅進行備份測試,為空則進行正式備份 */ private static String urlByTest = null; static{ // urlByTest = "https://www.cnblogs.com/huanzi-qch/p/9930390.html"; } /** * 博客主頁地址 */ private static String url = "https://www.cnblogs.com/huanzi-qch"; /** * 圖床地址 * PS:如果下載圖片到本地,則需要配置圖床地址 */ private static String imgPath = ""; /** * 是否下載圖片到本地 */ private static boolean isDownloadImg = false; /** * 是否入庫 */ private static boolean isPutDataBase = false; /** * main入口函數 */ public static void main(String[] args) { //創建一個WebClient,並模擬特定的瀏覽器 try (WebClient webClient = new WebClient(BrowserVersion.FIREFOX_78)) { webClient.getOptions().setJavaScriptEnabled(false);//禁用js webClient.getOptions().setCssEnabled(false);//禁用css webClient.getOptions().setTimeout(10000); //設置連接超時時間 // hutool工具類,使用jdbc進行操作 SimpleDataSource ds; Db db = null; if(isPutDataBase){ ds = new SimpleDataSource("jdbc:mysql://localhost/jfinal_demo", "root", "123456"); // ds = new SimpleDataSource("jdbc:oracle:thin:@localhost:1521:orcl", "test", "test"); db = Db.use(ds); } //測試備份一篇 if(urlByTest != null){ Map<Object, Object> paramMap =task(webClient,urlByTest); //新增入庫 if(isPutDataBase){ paramMap.put("id",RandomUtil.randomInt(1, 10000)); db.execute("insert into cnblogs values (:id, :title, :content, :date, :view_count, :comment_count)",paramMap); } System.out.println("測試備份完成!"); return; } //計時器 TimeInterval timer = DateUtil.timer(); System.out.println("開始備份,正在獲取所有博客地址..."); //獲取所有博客鏈接 ArrayList<String> arrayList = getUrls(webClient, url + "/default.html?page=", 1); System.out.println("獲取所有博客地址成功,共有"+arrayList.size()+"篇博客"); //清空表 if(isPutDataBase) { db.execute("truncate table cnblogs"); } for (int i = 0; i < arrayList.size(); i++) { Map<Object, Object> paramMap =task(webClient,arrayList.get(i)); //新增入庫 if(isPutDataBase){ paramMap.put("id",i+1); db.execute("insert into cnblogs values (:id, :title, :content, :date, :view_count, 
:comment_count)",paramMap); } //隨機休眠 Thread.sleep(RandomUtil.randomInt(1000, 2000)); } System.out.println(arrayList.size()+"篇博客備份全部完成!耗時:"+(timer.intervalMinute()) + "分鍾"); }catch (Exception e){ System.err.println("備份出錯!!"); e.printStackTrace(); } } /** * file工具類 */ private static class FileUtil { /** * 創建文件 * * @param pathNameAndFileName 路徑跟文件名 * @return File對象 */ private static File createFile(String pathNameAndFileName) { File file = new File(pathNameAndFileName); try { //獲取父目錄 File fileParent = file.getParentFile(); if (!fileParent.exists()) { fileParent.mkdirs(); } //創建文件 if (!file.exists()) { file.createNewFile(); } } catch (Exception e) { file = null; System.err.println("新建文件操作出錯"); e.printStackTrace(); } return file; } /** * 字符流寫入文件 * * @param file file對象 * @param stringBuilder 要寫入的數據 */ private static void fileWriter(File file, StringBuilder stringBuilder) { //字符流 try { FileWriter resultFile = new FileWriter(file, false);//true,則追加寫入 false,則覆蓋寫入 PrintWriter myFile = new PrintWriter(resultFile); //寫入 myFile.println(stringBuilder.toString()); myFile.close(); resultFile.close(); } catch (Exception e) { System.err.println("寫入操作出錯"); e.printStackTrace(); } } } /** * 根據URL獲取博客內容,解析html轉成md文檔格式 * 1、下載保存圖片 * 2、保存md文檔 * 3、返回博客信息,如果需要可以存庫 */ private static Map<Object, Object> task(WebClient webClient,String url) throws IOException, InterruptedException { //發起請求 HtmlPage page = webClient.getPage(url); //發布時間 DomNode postDate = page.querySelector("span#post-date"); String date = postDate.asNormalizedText(); //閱讀數 DomNode postViewCount = page.querySelector("span#post_view_count"); String viewCount = postViewCount.asNormalizedText(); //評論數 DomNode postCommentCount = page.querySelector("span#post_comment_count"); String commentCount = postCommentCount.asNormalizedText(); //標題 DomNode postTitle = page.querySelector("a#cb_post_title_url"); String titleName = postTitle.asNormalizedText(); //內容 StringBuilder stringBuilder = new StringBuilder(); DomNodeList<DomNode> 
childNodes = page.querySelector("div#cnblogs_post_body").getChildNodes(); DomNode[] array = new DomNode[childNodes.size()]; array = childNodes.toArray(array); List<DomNode> psParamList = new ArrayList<>(Arrays.asList(array)); for (int i = 0; i < psParamList.size(); i++) { DomNode childNode = psParamList.get(i); //<div class="para"> Node aClass = childNode.getAttributes().getNamedItem("class"); if("div".equals(childNode.getNodeName()) && aClass != null && "para".equals(aClass.getTextContent())){ psParamList.addAll(i,childNode.getChildNodes()); psParamList.remove(childNode); i--; continue; } //h2,二級標題 if("h2".equals(childNode.getNodeName())){ String text = childNode.asNormalizedText(); if(!"".equals(text.trim().replaceAll(" ",""))){ stringBuilder.append("## ").append(text).append(" <br/>\n"); } continue; } //h3,三級標題 if("h3".equals(childNode.getNodeName())){ String text = childNode.asNormalizedText(); if(!"".equals(text.trim().replaceAll(" ",""))){ stringBuilder.append("### ").append(text).append(" <br/>\n"); } continue; } //div,代碼內容 if("div".equals(childNode.getNodeName()) && aClass != null && "cnblogs_code".equals(aClass.getTextContent())){ stringBuilder.append("```").append("\n"); stringBuilder.append(childNode.asNormalizedText()).append("\n"); stringBuilder.append("```").append("\n"); continue; } //table,表格內容 if("table".equals(childNode.getNodeName())){ DomNodeList<DomNode> trDomNodes = childNode.querySelectorAll("tr"); stringBuilder.append("\n"); for (int j = 0; j < trDomNodes.size(); j++) { DomNode trDomNode = trDomNodes.get(j); DomNodeList<DomNode> tdDomNodes = trDomNode.querySelectorAll("td"); for (DomNode tdDomNode : tdDomNodes) { stringBuilder.append("|").append(tdDomNode.asNormalizedText()); } stringBuilder.append("|\n"); //標題 if(j == 0){ for (int k = 0; k < tdDomNodes.size(); k++) { stringBuilder.append("|:----:"); } stringBuilder.append("|\n"); } } stringBuilder.append(" <br/>\n"); continue; } //文本內容 //圖片 if(childNode.asXml().contains("<img")){ 
DomNodeList<DomNode> imgDomNodes = childNode.querySelectorAll("img"); DomNode[] array1 = new DomNode[imgDomNodes.size()]; array1 = imgDomNodes.toArray(array1); List<DomNode> psParamList1 = new ArrayList<>(Arrays.asList(array1)); if(psParamList1.size() <= 0){ psParamList1.add(childNode); } for (DomNode imgDomNode : psParamList1) { Node srcItem = imgDomNode.getAttributes().getNamedItem("src"); if(srcItem == null){ break; } //得到圖片網絡地址 String src = srcItem.getTextContent(); //將文件下載后保存 String[] split = src.split("/1353055/"); //圖片保存路徑,隨機休眠1-2秒,重要:先下載到本地,再上傳到圖床 if(isDownloadImg){ File file = new File("F:/cnblogs/blog-image/" + split[1]); if(!file.exists()){ Thread.sleep(RandomUtil.randomInt(1000, 2000)); HttpUtil.downloadFile(src, file); } //寫入新路徑 stringBuilder.append(".append(imgPath).append(split[1]).append(")").append(" "); }else{ //寫入src路徑 stringBuilder.append(".append(src).append(")").append(" "); } } stringBuilder.append(" <br/>\n"); } //標注字體顏色 else if(childNode.getLastChild() != null && "span".equals(childNode.getLastChild().getNodeName())){ DomNode span = childNode.getLastChild(); stringBuilder.append(" ").append(span.asXml().replaceAll("\r","").replaceAll("\n","")).append(" <br/>\n"); } //包含a標簽 else if(childNode.asXml().contains("</a>")){ String newPText = childNode.asNormalizedText(); DomNodeList<DomNode> aDomNodes = childNode.querySelectorAll("a"); for (DomNode aDomNode : aDomNodes) { String text = aDomNode.asNormalizedText(); String href = aDomNode.getAttributes().getNamedItem("href").getTextContent(); String newStr = "["+text+"]("+href+")"; newPText = newPText.replace(text,newStr); } //替換 stringBuilder.append(newPText).append(" <br/>\n"); } //普通文字 else{ String pText = childNode.asNormalizedText(); if(StrUtil.isBlankIfStr(pText)){ stringBuilder.append("\n"); }else{ //四個空格轉換 stringBuilder.append(pText.replaceFirst(" "," ")).append(" <br/>\n"); } } } //生成md文檔(文件名不能包含特殊字符:\,/,:,*,?,",<,>,|) String titleNameFileName = titleName .replaceAll("\\\\","_") 
.replaceAll("/","_") .replaceAll(":","_") .replaceAll("\\*","_") .replaceAll("\\?","_") .replaceAll("\"","_") .replaceAll("<","_") .replaceAll(">","_") .replaceAll("\\|","_") ; FileUtil.fileWriter(FileUtil.createFile("F:\\cnblogs\\《"+ titleNameFileName +"》.md"),stringBuilder); System.out.println("《"+titleName+"》備份完成!"); return MapUtil.builder() .put("title", titleName) .put("content", stringBuilder.toString()) .put("date", date) .put("view_count", viewCount) .put("comment_count", commentCount) .build(); } /** * 獲取所有博客地址 */ private static ArrayList<String> getUrls(WebClient webClient, String url,int pageNumber) throws IOException, InterruptedException { ArrayList<String> arrayList = new ArrayList<>(10); //發起請求 HtmlPage page = webClient.getPage(url + pageNumber); //獲取URL for (DomNode domNode : page.querySelectorAll("div.postTitle")) { arrayList.add(domNode.querySelector("a.postTitle2").getAttributes().getNamedItem("href").getTextContent()); } //下一頁 if(page.querySelector("div.topicListFooter").asNormalizedText().contains("下一頁")){ //隨機休眠 Thread.sleep(RandomUtil.randomInt(1000, 2000)); ArrayList<String> urls = getUrls(webClient,url,pageNumber+1); arrayList.addAll(urls); } return arrayList; } }
幾點注意事項:
0、使用Java備份腳本要遵紀守法,備份下來的博客數據僅用於數據備份;每篇博客之間請設置5-10秒的隨機間隔,不要影響到網站的正常使用
1、文件名不能有特殊字符
2、數據庫表的內容字段,字符集要utf8mb4,類型要mediumtext


代碼開源
代碼已經開源,託管到我的GitHub、碼雲:
