動態網頁圖片爬取--HtmlUnit+Jsoup

本文轉載自查看原文 2019-04-15 17:31 498 爬蟲/ Java

根據網頁的URL爬取網頁上的圖片，並打包生成壓縮文件（HtmlUnit+Jsoup+ZipOutputStream）

1.獲取網頁JS動態加載后的內容用到了HtmlUnit

2.根據解析后的XML獲取指定標簽內容用到了Jsoup

3.最后生成壓縮文件用到了ZipOutputStream

package com.wl.test3;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlPage;

/**
 * Collects the images referenced by a web page and packs them into a ZIP archive.
 *
 * <p>The work is done in two steps:
 * <ol>
 *   <li>{@link #getPicSrc(String)} loads the page with HtmlUnit (so that markup produced by
 *       JavaScript is present), hands the resulting XML to Jsoup and extracts the {@code src}
 *       of every {@code <img>} element.</li>
 *   <li>{@link #picZip(List)} downloads each image and writes it as one entry of a ZIP file
 *       through a {@link ZipOutputStream}.</li>
 * </ol>
 *
 * @author Administrator
 */
public class PicFileCompression {

    /** Archive written by {@link #picZip(List)}, which takes no explicit target. */
    private static final String DEFAULT_ZIP_PATH = "F:/Pictures.zip";

    /** Upper bound, in milliseconds, for waiting on background JavaScript after a page load. */
    private static final long JS_WAIT_MILLIS = 10000L;

    /** Timeout, in milliseconds, for establishing the connection to an image host. */
    private static final int CONNECT_TIMEOUT_MILLIS = 10000;

    /** Timeout, in milliseconds, for a single blocking read while downloading an image. */
    private static final int READ_TIMEOUT_MILLIS = 30000;

    /** Size of the buffer used to copy image bytes into the archive. */
    private static final int BUFFER_SIZE = 8192;

    /** Entry name used when an image URL has no usable last path segment. */
    private static final String FALLBACK_ENTRY_NAME = "image";

    /**
     * Loads a page in a simulated Chrome browser and returns the absolute URLs of its images.
     *
     * <p>Each {@code src} attribute is resolved against the URL of the loaded page, so relative
     * paths and protocol-relative references ({@code //host/a.png}) become full URLs. Values
     * that do not resolve to an {@code http} or {@code https} URL (empty attributes,
     * {@code data:} URIs, ...) are skipped because {@link #picZip(List)} could not download them.
     *
     * @param url address of the web page
     * @return absolute image URLs in document order; empty when the page has no usable images
     * @throws Exception if the page cannot be loaded
     */
    public static List<String> getPicSrc(String url) throws Exception {
        List<String> srcList = new ArrayList<>();
        // try-with-resources releases the simulated browser even when loading or parsing fails.
        try (WebClient webClient = new WebClient(BrowserVersion.CHROME)) {
            // Script errors and non-200 responses must not abort the crawl.
            webClient.getOptions().setThrowExceptionOnScriptError(false);
            webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);
            // CSS is irrelevant for collecting image URLs.
            webClient.getOptions().setCssEnabled(false);
            // JavaScript and AJAX support are what make dynamically inserted images visible.
            webClient.getOptions().setJavaScriptEnabled(true);
            webClient.setAjaxController(new NicelyResynchronizingAjaxController());

            HtmlPage page = webClient.getPage(url);
            // Must run AFTER the page is loaded: before that there are no background jobs to
            // wait for, so the call would return immediately without waiting for anything.
            webClient.waitForBackgroundJavaScript(JS_WAIT_MILLIS);

            // Use the page's own URL (after redirects) as base URI for resolving src values.
            Document document = Jsoup.parse(page.asXml(), page.getUrl().toString());
            for (Element img : document.getElementsByTag("img")) {
                String src = img.absUrl("src");
                if (isHttpUrl(src)) {
                    System.out.println("圖片地址：" + src);
                    srcList.add(src);
                }
            }
        }
        return srcList;
    }

    /**
     * Downloads the given images and packs them into the default archive
     * ({@code F:/Pictures.zip}).
     *
     * @param src absolute image URLs
     * @throws Exception if an image cannot be downloaded or the archive cannot be written
     */
    public static void picZip(List<String> src) throws Exception {
        picZip(src, new File(DEFAULT_ZIP_PATH));
    }

    /**
     * Downloads the given images and packs them into {@code target}.
     *
     * <p>Missing parent directories of {@code target} are created; an existing file is
     * overwritten. Each image becomes one entry named after the last path segment of its URL;
     * when several images share a name, later ones get a numeric suffix so that the archive
     * never contains duplicate entries.
     *
     * @param src    absolute image URLs
     * @param target ZIP file to write
     * @throws Exception if an image cannot be downloaded or the archive cannot be written
     */
    public static void picZip(List<String> src, File target) throws Exception {
        File parent = target.getAbsoluteFile().getParentFile();
        if (parent != null) {
            parent.mkdirs();
        }

        Set<String> usedNames = new HashSet<>();
        byte[] buffer = new byte[BUFFER_SIZE];
        try (ZipOutputStream out = new ZipOutputStream(new FileOutputStream(target))) {
            for (String s : src) {
                URL url = new URL(s);
                HttpURLConnection connection = (HttpURLConnection) url.openConnection();
                // Without timeouts a single unresponsive host would block the run forever.
                connection.setConnectTimeout(CONNECT_TIMEOUT_MILLIS);
                connection.setReadTimeout(READ_TIMEOUT_MILLIS);
                try (InputStream input = connection.getInputStream()) {
                    // The entry is opened only once the download stream is available, so a
                    // failed request does not leave an empty entry behind.
                    out.putNextEntry(new ZipEntry(uniqueEntryName(url, usedNames)));
                    int length;
                    while ((length = input.read(buffer)) != -1) {
                        out.write(buffer, 0, length);
                    }
                    out.closeEntry();
                } finally {
                    connection.disconnect();
                }
            }
        }
        System.out.println("壓縮包已生成，地址：" + target.getAbsolutePath());
    }

    /** Tells whether {@code candidate} starts with the {@code http://} or {@code https://} scheme. */
    private static boolean isHttpUrl(String candidate) {
        return candidate.regionMatches(true, 0, "http://", 0, 7)
                || candidate.regionMatches(true, 0, "https://", 0, 8);
    }

    /**
     * Derives a ZIP entry name from the last path segment of {@code imageUrl} (query string and
     * fragment excluded) and records it in {@code usedNames}; a name that is already taken gets
     * {@code -1}, {@code -2}, ... inserted before its extension.
     */
    private static String uniqueEntryName(URL imageUrl, Set<String> usedNames) {
        String path = imageUrl.getPath();
        String base = path.substring(path.lastIndexOf('/') + 1);
        if (base.isEmpty()) {
            base = FALLBACK_ENTRY_NAME;
        }

        int dot = base.lastIndexOf('.');
        String stem = dot > 0 ? base.substring(0, dot) : base;
        String extension = dot > 0 ? base.substring(dot) : "";

        String candidate = base;
        for (int n = 1; !usedNames.add(candidate); n++) {
            candidate = stem + "-" + n + extension;
        }
        return candidate;
    }

    /**
     * Demo entry point: crawls a sample page and zips whatever images were found.
     *
     * @param args unused
     * @throws Exception if crawling or archiving fails
     */
    public static void main(String[] args) throws Exception {
        List<String> picSrc = getPicSrc("https://ent.sina.com.cn/film/");
        if (picSrc.size() > 0) {
            picZip(picSrc);
        } else {
            System.out.println("未獲取到網頁上任何圖片！");
        }
    }

}

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯系本站郵箱yoyou2525@163.com刪除。

猜您在找 使用Jsoup和htmlunit爬取動態網頁爬蟲入門（三）——動態網頁爬取：爬取pexel上的圖片 Python爬蟲爬取動態網頁動態網頁爬取方法動態網頁爬取流程總結 python動態網頁的爬取爬取京東網頁評論（動態網頁） Python爬取javascript(js)動態網頁 R語言爬取動態網頁之環境准備 python爬取動態網頁數據，詳解