初體驗Jsoup

<!-- Maven coordinates for the Jsoup dependency -->
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.13.1</version>
</dependency>

我們先來找到博客園的個人首頁做一個簡單的小練習：https://www.cnblogs.com/hanzhe

調用Jsoup的connect靜態函數創建連接，將爬取的目標網站作為參數傳遞過去：

/** Minimal demo: builds a Jsoup connection object for the target page. */
public class Demo {
    public static void main(String[] args) {
        // The page to crawl is passed to the static connect() factory.
        final String targetUrl = "https://www.cnblogs.com/hanzhe";
        // execute() is never called in this snippet, so only the connection object is created.
        Connection connection = Jsoup.connect(targetUrl);
    }
}

為了防止爬蟲受到限制，這裡設置請求頭來模仿瀏覽器客戶端，可以參照瀏覽器實際發出的請求進行修改，例如：

/** Demo: builds a Jsoup connection that identifies itself with a browser user-agent. */
public class Demo {
    public static void main(String[] args) {
        final String browserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36 Edg/88.0.705.56";
        Connection connection = Jsoup.connect("https://www.cnblogs.com/hanzhe");
        // Only one header is set here; add further headers whenever the crawl runs into problems.
        connection.header("user-agent", browserAgent);
    }
}

然後調用execute方法開始進行爬取，通過body取出爬取到的數據：

public class Demo {
    public static void main(String[] args) throws IOException {
        Connection connect = Jsoup.connect("https://www.cnblogs.com/hanzhe")
                .header("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36 Edg/88.0.705.56");
        String body = connect.execute().body();
        System.out.println(body);
    }
}

可以看到已經爬取到了首頁的內容：

但是之前說Jsoup可以像操作JS一樣對網頁內容進行提取，所以我們在獲取爬取內容之前要先對內容進行解析：

public class Demo {
    public static void main(String[] args) throws IOException {
        Connection connect = Jsoup.connect("https://www.cnblogs.com/hanzhe")
                .header("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36 Edg/88.0.705.56");
        // 使用parse函數對爬取到的內容進行解析
        Element body = connect.execute().parse().body();
        System.out.println(body);
    }
}

明顯看到解析後的HTML被格式化過，看著非常整齊，而且返回值也從字符串變成了Element實例，可以通過操作實例實現內容篩選

測試爬取頁面隨筆

打開F12開發者工具，嘗試獲取到與隨筆標題相關的信息：

觀察發現每個隨筆的標題都是使用postTitle2 vertical-middle兩個class進行修飾的，我們可以使用選擇器來找到所有標題：

我們就用該選擇器在Jsoup中爬取所有隨筆標題：

/** Demo: fetches the blog home page and prints the text of every post title. */
public class Demo {
    public static void main(String[] args) throws IOException {
        Connection connect = Jsoup.connect("https://www.cnblogs.com/hanzhe")
                .header("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36 Edg/88.0.705.56");
        Element body = connect.execute().parse().body();
        // getElementsByClass expects ONE class name; passing "postTitle2 vertical-middle" only
        // matches when the class attribute is exactly that string. A compound CSS selector
        // matches every element carrying both classes, regardless of order or extra classes.
        Elements titles = body.select(".postTitle2.vertical-middle");
        for (Element title : titles) {
            System.out.println(title.text());
        }
    }
}

實戰：爬取筆趣閣小說：

被選中的幸運兒：http://www.paoshuzw.com/26/26874/

具體Java代碼：

/**
 * Crawls a novel chapter by chapter, starting at {@code url} and following the
 * "next chapter" link, and appends every chapter to a single text file.
 */
public class 筆趣閣爬取小說 {

    // Target URL: crawling starts at the first chapter and continues until the last one
    private static String url = "http://www.paoshuzw.com/26/26874/13244872.html";
    // Output file name (usually the book title); for plain reuse, changing these two settings is enough
    private static String fileName = "倉元圖";


    // Indentation: four spaces
    private static String space = "    ";
    // Output writer for the book file (created in getWriter with an explicit UTF-8 encoding)
    private static java.io.Writer writer;
    // Counter of processed chapters, used in progress messages
    private static int pageCount = 1;

    /**
     * Entry point: opens the output file, crawls chapters until no next chapter
     * is found, then reports the elapsed time in seconds.
     *
     * @param args unused
     * @throws Exception if fetching a page or writing the file fails
     */
    public static void main(String[] args) throws Exception {
        // Initialise the output file
        getWriter();
        long l = System.currentTimeMillis();
        // try-with-resources guarantees the writer is closed even when a chapter fails mid-crawl
        try (java.io.Writer autoClosed = writer) {
            // Crawl chapter after chapter
            do {
                Element element = nextPage(url);
                outputToFile(element);
                url = hasNext(element);
            } while (url != null);
        }
        long time = (System.currentTimeMillis() - l) / 1000;
        System.out.println("\n\n 成功爬取所有章節！耗時" + time + "秒");
    }

    /**
     * Fetches one page and returns its parsed body element.
     *
     * @param url address of the chapter page
     * @return the body element of the parsed page
     * @throws IOException if the request or parsing fails
     */
    private static Element nextPage(String url) throws IOException {
        // Build the connection and imitate a browser through request headers.
        // No "Host" header is set: its value must be a host name, not a full URL,
        // and the HTTP client derives it from the request URL on its own.
        Connection conn = Jsoup.connect(url)
                .header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9")
                .header("Accept-Encoding", "gzip, deflate")
                .header("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6")
                .header("Cache-Control", "max-age=0")
                .header("Connection", "keep-alive")
                .header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36 Edg/88.0.705.56");
        return conn.execute().parse().body();
    }

    /**
     * Returns the title of the current chapter.
     *
     * @param element body element of a chapter page
     * @return text of the {@code h1} inside the {@code .bookname} container
     */
    private static String getTitle(Element element) {
        return element.select(".bookname h1").text();
    }

    /**
     * Extracts the chapter text from the page.
     * Note: this removes all {@code p} elements from {@code element} as a side effect.
     *
     * @param element body element of a chapter page
     * @return the chapter text with HTML spacing and line breaks normalised
     */
    private static String getContent(Element element) {
        // Drop the advertisement placed in P tags at the bottom
        element.getElementsByTag("p").remove();
        // Take the inner HTML of the element whose id is "content"
        String body = element.select("#content").html();
        // Convert HTML spacing/line-break markup into plain text formatting
        body = body.replace("&nbsp;&nbsp;&nbsp;&nbsp;", space);
        body = body.replace("<br>", "");
        return body.replace(" \n \n", "\r\n");
    }

    /**
     * Determines whether a next chapter exists.
     *
     * @param element body element of a chapter page
     * @return the absolute URL of the next chapter, or {@code null} when there is none
     */
    private static String hasNext(Element element) {
        // Locate the "next chapter" button (fourth link of the first "bottem2" container) and read its target
        Elements div = element.getElementsByClass("bottem2");
        Element a = div.get(0).getElementsByTag("a").get(3);
        String href = a.attr("href");
        // Observed: when a next chapter exists the link ends with .html, otherwise it leads to
        // the index page; that difference is used to detect the last chapter
        return href.endsWith(".html") ? "http://www.paoshuzw.com" + href : null;
    }

    /**
     * Creates the output writer for {@code D:/<fileName>.txt}.
     * Terminates the program when the file already exists so an existing book is not overwritten.
     *
     * @throws IOException if the file cannot be opened for writing
     */
    private static void getWriter() throws IOException {
        String path = "D:/" + fileName + ".txt";
        File file = new File(path);
        if (file.exists()) {
            System.out.println("目標書籍已存在！請修改文件名稱或刪除原書籍：" + path);
            System.exit(0);
        }
        // Explicit UTF-8: the platform default charset may be unable to encode the Chinese text
        writer = new java.io.OutputStreamWriter(
                new java.io.FileOutputStream(file), java.nio.charset.StandardCharsets.UTF_8);
    }

    /**
     * Writes one chapter (indented title, blank line, content) to the output file
     * and prints a progress message.
     *
     * @param element body element of a chapter page
     * @throws IOException if writing fails
     */
    private static void outputToFile(Element element) throws IOException {
        String title = getTitle(element);
        String content = getContent(element);
        String text = space + title + "\r\n\r\n" + content;
        writer.write(text);
        // Flush after every chapter so finished chapters reach the disk even if a later one fails
        writer.flush();
        System.out.println("==>>【" + title + "】爬取完成，正在爬取下一章（第" + pageCount++ + "次操作）");
    }

}

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯繫本站郵箱yoyou2525@163.com刪除。

猜您在找 python爬取筆趣閣小說 Python爬蟲練習（一）爬取筆趣閣小說（搜索+爬取） python爬去筆趣閣完整一本小說 Python爬蟲練習:抓取筆趣閣小說(一) 爬蟲入門實例：利用requests庫爬取筆趣小說網用python爬取新筆趣閣的所有小說，使用xpath提取 Java爬蟲：用java爬取小說 Golang 簡單爬蟲實現，爬取小說 java網絡爬蟲-利用phantomjs和jsoup爬取動態ajax加載頁面 python爬蟲之爬取小說（一）