Java 爬取網頁內容：標題、圖片等

本文轉載自查看原文 2021-09-24 12:26 108

package com.fh.util;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Utility for scraping a web page: its raw HTML, its title and the URLs of
 * the images it references.
 * Author: FH Admin
 * from: fhadmin.cn
 */
public class GetWeb {

    /**
     * Matches an {@code <img>} tag whose {@code src}/{@code src2} attribute
     * ends in a known image extension. Group 3 is the optional opening quote,
     * group 4 the attribute value. Compiled once and matched
     * case-insensitively so that {@code <Img>} or {@code .JPG} are found too.
     */
    private static final Pattern IMG_PATTERN = Pattern.compile(
            "<(img|IMG)\\b[^>]*\\b(src|SRC|src2|SRC2)\\b\\s*=\\s*('|\")?([^'\"\n\r\f>]+(\\.jpg|\\.bmp|\\.eps|\\.gif|\\.mif|\\.miff|\\.png|\\.tif|\\.tiff|\\.svg|\\.wmf|\\.jpe|\\.jpeg|\\.dib|\\.ico|\\.tga|\\.cut|\\.pic)\\b)[^>]*>",
            Pattern.CASE_INSENSITIVE);

    /**
     * Matches the {@code <title>} element; group 2 is its text. DOTALL lets
     * the title span several lines now that line breaks are preserved.
     */
    private static final Pattern TITLE_PATTERN = Pattern.compile(
            "(<title>|<TITLE>)(.*?)(</title>|</TITLE>)",
            Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

    /**
     * Downloads the document behind a URL and returns it as text.
     * <p>
     * The content is decoded as UTF-8. Every line read is terminated with a
     * single {@code '\n'} so that tokens on adjacent lines are never fused
     * together (for example {@code <img} followed by {@code src=} on the next
     * line).
     *
     * @param httpUrl
     *            address of the page to read
     * @return the page content
     * @throws IOException
     *             if the URL is malformed or the content cannot be read
     */
    public static String getHtmlCode(String httpUrl) throws IOException {
        URL url = new URL(httpUrl);
        StringBuilder content = new StringBuilder();
        InputStream in = url.openStream();
        try {
            BufferedReader reader = new BufferedReader(new InputStreamReader(in, "utf-8"));
            String line;
            while ((line = reader.readLine()) != null) {
                content.append(line).append('\n');
            }
        } finally {
            // Always release the connection, even when reading fails half-way.
            in.close();
        }
        return content.toString();
    }

    /**
     * Collects the URLs of all images referenced by {@code <img>} tags of a
     * page.
     * <p>
     * Sources that already start with {@code http://} or {@code https://} are
     * returned as found; other sources are turned into absolute URLs where
     * possible (see {@link #resolveImageUrl(String, String)}).
     *
     * @param httpUrl
     *            address of the page to scan
     * @return the image URLs in document order, possibly empty
     * @throws IOException
     *             if the page cannot be downloaded
     */
    public static List<String> getImagePathList(String httpUrl)
            throws IOException {
        List<String> imgList = new ArrayList<String>();
        Matcher matcher = IMG_PATTERN.matcher(getHtmlCode(httpUrl));
        while (matcher.find()) {
            String quote = matcher.group(3);
            String imgsrc = matcher.group(4);
            if (quote == null || quote.trim().length() == 0) {
                // Unquoted attribute value: it ends at the first whitespace.
                imgsrc = imgsrc.split("\\s+")[0];
            }
            if (!imgsrc.startsWith("http://") && !imgsrc.startsWith("https://")) {
                imgsrc = resolveImageUrl(httpUrl, imgsrc);
            }
            imgList.add(imgsrc);
        }
        return imgList;
    }

    /**
     * Turns a non-absolute image source into an absolute URL.
     * <ul>
     * <li>{@code //host/path} (protocol-relative) gets the page's scheme.</li>
     * <li>{@code /path} (root-relative) is appended to the page's
     * {@code scheme://host}.</li>
     * <li>Anything else is probed over the network: first directly under the
     * host, then under each successively deeper path segment of the page URL;
     * the first reachable candidate wins.</li>
     * </ul>
     * If no candidate is reachable, or the page URL has no
     * {@code scheme://host} prefix to build on, the source is returned
     * unchanged.
     */
    private static String resolveImageUrl(String httpUrl, String imgsrc) {
        String[] parts = httpUrl.split("/");
        if (parts.length < 3) {
            // Not of the form "scheme://host/..."; nothing to resolve against.
            return imgsrc;
        }
        if (imgsrc.startsWith("//")) {
            return parts[0] + imgsrc; // parts[0] is the scheme including ':'
        }
        String base = parts[0] + "//" + parts[2];
        if (imgsrc.startsWith("/")) {
            return base + imgsrc;
        }
        if (isNetFileAvailable(base + "/" + imgsrc)) {
            return base + "/" + imgsrc;
        }
        for (int i = 3; i < parts.length; i++) {
            base = base + "/" + parts[i];
            String candidate = base + "/" + imgsrc;
            if (isNetFileAvailable(candidate)) {
                return candidate;
            }
        }
        return imgsrc;
    }

    /**
     * Returns the text of the page's {@code <title>} element.
     * <p>
     * The method name is misspelled; it is kept as is so existing callers
     * keep working.
     *
     * @param httpUrl
     *            address of the page to scan
     * @return the title text, or {@code null} if the page has no title or
     *         could not be downloaded (the stack trace is printed in the
     *         latter case)
     */
    public static String getTilte(String httpUrl) {
        try {
            Matcher matcher = TITLE_PATTERN.matcher(getHtmlCode(httpUrl));
            if (matcher.find()) {
                return matcher.group(2);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return null;
    }

    /**
     * Checks whether a network resource exists by opening it for reading.
     *
     * @param strUrl
     *            address of the resource to probe
     * @return {@code true} if an input stream could be obtained,
     *         {@code false} if the URL is malformed or opening it failed
     */
    public static boolean isNetFileAvailable(String strUrl) {
        InputStream netFileInputStream = null;
        try {
            URLConnection urlConn = new URL(strUrl).openConnection();
            netFileInputStream = urlConn.getInputStream();
            return netFileInputStream != null;
        } catch (IOException e) {
            return false;
        } finally {
            if (netFileInputStream != null) {
                try {
                    netFileInputStream.close();
                } catch (IOException ignored) {
                    // The probe result is already known; a failed close does not change it.
                }
            }
        }
    }
}

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯繫本站郵箱 yoyou2525@163.com 刪除。

猜您在找：java爬蟲爬取網頁內容前，對網頁內容的編碼格式進行判斷的方式；學習使用Java的webmagic框架爬取網頁內容；python爬取網頁內容demo；如何使用Jsoup爬取網頁內容；網頁內容爬取：如何提取正文內容；網頁內容爬取：如何提取正文內容 BEAUTIFULSOUP的輸出；使用Java Jsoup爬取網頁內容（存入本地並從本地讀取）；java爬取網頁內容簡單例子（2）——附jsoup的select用法詳解；java爬取網頁內容簡單例子（1）——使用正則表達式；python的requests模塊爬取網頁內容

java 爬取網頁內容。 標題、圖片等

免責聲明！

java 爬取網頁內容。標題、圖片等