使用Java Jsoup爬取網頁內容(存入本地並從本地讀取)


GetPageInfo 獲取數據、存入本地、從本地讀取數據


import lombok.SneakyThrows;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import java.io.*;
import java.nio.charset.StandardCharsets;

import javax.net.ssl.HttpsURLConnection;

/**
 * Scrapes a paginated listing with Jsoup, collects the text of every element
 * with class {@code alink}, and stores the result in a local text file.
 *
 * <p>Pages are buffered in memory and appended to the output file every
 * {@link #PAGES_PER_FLUSH} pages; each page's text is followed by one space,
 * so the saved file can later be split on whitespace
 * (for example {@code readFileByLines(path).split("\\s+")}).
 */
public class GetPageInfo {

    /** Directory that receives the scraped data. */
    private static final String OUT_DIR = "F:\\nistCasData";

    /** Name of the output file inside {@link #OUT_DIR}. */
    private static final String OUT_FILE = "cas.txt";

    /** Number of pages kept in memory between two disk writes. */
    private static final int PAGES_PER_FLUSH = 10;

    /** Set once the JVM-wide HTTPS defaults have been relaxed by {@link #relaxTlsChecksOnce()}. */
    private static boolean tlsChecksRelaxed;

    /**
     * Fetches the configured number of pages and saves their data to
     * {@code F:\nistCasData\cas.txt}.
     *
     * <p>The first write of a run truncates the file, so data left over from an
     * earlier run is never mixed into the new output; later writes append.
     *
     * @param args unused
     * @throws Exception if a page cannot be fetched or the file cannot be written
     */
    public static void main(String[] args) throws Exception {
        StringBuilder pending = new StringBuilder();   // data fetched since the last flush
        boolean fileStarted = false;                   // true once this run has written the file
        int pages = 10;                                // number of pages to scrape

        for (int i = 1; i <= pages; i++) {
            String url = "https://www.xxx.com.cn/list/?p=" + i;
            String cas = getCas(url);
            System.out.println("第" + i + "頁數據獲取成功");
            pending.append(cas).append(" ");           // one space separates consecutive pages

            // Flush in batches so a long crawl neither keeps everything in
            // memory nor loses everything on a late failure.
            if (i % PAGES_PER_FLUSH == 0) {
                writeToFile(pending.toString(), OUT_DIR, OUT_FILE, fileStarted);
                fileStarted = true;
                System.out.println("第" + i / PAGES_PER_FLUSH + "次保存成功");
                pending.setLength(0);
            }
        }

        // Write the pages left over after the last full batch. When nothing has
        // been written yet, still (re)create the file so it reflects this run.
        if (pending.length() > 0 || !fileStarted) {
            writeToFile(pending.toString(), OUT_DIR, OUT_FILE, fileStarted);
        }
    }

    /**
     * Downloads one page and extracts the text of all {@code .alink} elements.
     *
     * <p>Before the first request the JVM-wide HTTPS defaults are relaxed via
     * {@link HttpsUrlValidator} so that sites with invalid certificates can be
     * read. This is done directly instead of through
     * {@code HttpsUrlValidator.retrieveResponseFromServer(url)}, which would
     * download every page a second time just for that side effect.
     *
     * @param url address of the page to fetch
     * @return the combined text of all elements with class {@code alink}
     */
    @SneakyThrows
    public static String getCas(String url) {
        relaxTlsChecksOnce();

        // Request headers copied from a browser session (see the browser dev tools).
        Connection.Response response = Jsoup
                .connect(url)
                .header("Accept","text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9")
                .header("Accept-Encoding","*/*")
                .header("Accept-Language","zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7,ja-JP;q=0.6,ja;q=0.5,ko-KR;q=0.4,ko;q=0.3")
                .header("Connection","keep-alive")
                .header("Content-Type","application/json;charset=UTF-8")
                .header("User-Agent","Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36")
                .timeout(10000)     // request timeout in milliseconds
                .ignoreContentType(true)
                .execute();

        Document doc = Jsoup.parse(response.body());
        return doc.select(".alink").text();
    }

    /**
     * Installs the trust-all certificate manager and hostname verifier from
     * {@link HttpsUrlValidator} as JVM defaults, at most once per run.
     *
     * <p>SECURITY: this disables HTTPS certificate and hostname validation for
     * every {@code HttpsURLConnection} in the JVM. The call is idempotent, so a
     * repeated invocation from another thread is harmless.
     *
     * @throws Exception if the SSL context cannot be created or initialised
     */
    private static void relaxTlsChecksOnce() throws Exception {
        if (tlsChecksRelaxed) {
            return;
        }
        HttpsUrlValidator.trustAllHttpsCertificates();
        HttpsURLConnection.setDefaultHostnameVerifier(HttpsUrlValidator.hv);
        tlsChecksRelaxed = true;
    }

    /**
     * Saves text to a local file, replacing any previous content.
     *
     * @param result      the data to write
     * @param outPath     directory to write into; created if missing
     * @param outFileName name of the file inside {@code outPath}
     * @throws Exception if the directory cannot be created or the file cannot be written
     */
    public static void writeOcrStrtoFile(String result, String outPath, String outFileName) throws Exception {
        writeToFile(result, outPath, outFileName, false);
    }

    /**
     * Writes text to {@code outPath/outFileName} as UTF-8.
     *
     * @param result      the data to write
     * @param outPath     directory to write into; created if missing
     * @param outFileName name of the file inside {@code outPath}
     * @param append      {@code true} to add to the end of the file,
     *                    {@code false} to truncate it first
     * @throws IOException if the directory cannot be created or the file cannot be written
     */
    private static void writeToFile(String result, String outPath, String outFileName, boolean append)
            throws IOException {
        File dir = new File(outPath);
        // The second isDirectory() check tolerates another process creating the directory concurrently.
        if (!dir.isDirectory() && !dir.mkdirs() && !dir.isDirectory()) {
            throw new IOException("Cannot create output directory: " + dir);
        }
        File txt = new File(outPath + "/" + outFileName);
        // FileOutputStream truncates on open unless append is requested, so no
        // explicit delete/create is needed; try-with-resources closes the stream
        // even when the write fails.
        try (FileOutputStream fos = new FileOutputStream(txt, append)) {
            fos.write(result.getBytes(StandardCharsets.UTF_8));
        }
    }

    /**
     * Reads a local UTF-8 text file line by line and returns the lines
     * concatenated without their line terminators.
     *
     * <p>Reading is best-effort: on an I/O error (including a missing file) the
     * stack trace is printed and whatever was read so far is returned.
     *
     * @param fileName path of the file to read
     * @return the file content with line breaks removed; empty if nothing could be read
     */
    public static String readFileByLines(String fileName) {
        StringBuilder readData = new StringBuilder();
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream(fileName), StandardCharsets.UTF_8))) {
            String line;
            // readLine() returns null at end of file
            while ((line = reader.readLine()) != null) {
                readData.append(line);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return readData.toString();
    }

}

HttpsUrlValidator 忽略https證書校驗(http應該不需要,沒試過)


import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;

import javax.net.ssl.HostnameVerifier;
import javax.net.ssl.HttpsURLConnection;
import javax.net.ssl.SSLSession;


/**
 * Helper that makes {@link HttpsURLConnection} accept any server certificate
 * and any hostname.
 *
 * <p>SECURITY: the settings installed here are JVM-wide defaults and remove all
 * protection against man-in-the-middle attacks. Use only for scraping or
 * testing against hosts whose certificates cannot be validated.
 */
public class HttpsUrlValidator {

    /** Hostname verifier that logs the host/peer pair and accepts every host. */
    static HostnameVerifier hv = new HostnameVerifier() {
        @Override
        public boolean verify(String urlHostName, SSLSession session) {
            System.out.println("Warning: URL Host: " + urlHostName + " vs. "
                    + session.getPeerHost());
            return true;
        }
    };

    /**
     * Installs the trust-all defaults and downloads the body of {@code url}.
     *
     * @param url address to fetch; must be an http(s) URL
     * @return the response body decoded as UTF-8 with each line terminated by
     *         {@code '\n'}, or {@code null} if the URL is malformed, is not an
     *         HTTP URL, or the request fails (the error message is printed)
     */
    public final static String retrieveResponseFromServer(final String url) {
        HttpURLConnection connection = null;

        try {
            URL validationUrl = new URL(url);
            trustAllHttpsCertificates();
            HttpsURLConnection.setDefaultHostnameVerifier(hv);

            connection = (HttpURLConnection) validationUrl.openConnection();

            // try-with-resources closes the reader (and the underlying stream)
            // even when reading fails part-way through.
            try (BufferedReader in = new BufferedReader(new InputStreamReader(
                    connection.getInputStream(), java.nio.charset.StandardCharsets.UTF_8))) {
                // The buffer never leaves this method, so no synchronization is needed.
                final StringBuilder body = new StringBuilder(255);
                String line;
                while ((line = in.readLine()) != null) {
                    body.append(line).append('\n');
                }
                return body.toString();
            }
        } catch (final Exception e) {
            // Best-effort contract: report the problem and signal failure with null.
            System.out.println(e.getMessage());
            return null;
        } finally {
            if (connection != null) {
                connection.disconnect();
            }
        }
    }

    /**
     * Replaces the default SSL socket factory of {@link HttpsURLConnection}
     * with one that trusts every certificate.
     *
     * @throws Exception if the TLS context cannot be obtained or initialised
     */
    public static void trustAllHttpsCertificates() throws Exception {
        javax.net.ssl.TrustManager[] trustAllCerts = { new miTM() };
        // "TLS" is the current standard context name; "SSL" is the legacy one.
        javax.net.ssl.SSLContext sc = javax.net.ssl.SSLContext.getInstance("TLS");
        sc.init(null, trustAllCerts, null);
        javax.net.ssl.HttpsURLConnection.setDefaultSSLSocketFactory(sc
                .getSocketFactory());
    }

    /** Trust manager that accepts every client and server certificate chain. */
    static class miTM implements javax.net.ssl.TrustManager,
            javax.net.ssl.X509TrustManager {

        /**
         * Returns the accepted issuers.
         *
         * @return an empty array; the interface contract forbids {@code null}
         */
        @Override
        public java.security.cert.X509Certificate[] getAcceptedIssuers() {
            return new java.security.cert.X509Certificate[0];
        }

        /**
         * Not part of {@code X509TrustManager}; kept for existing callers.
         *
         * @param certs ignored
         * @return always {@code true}
         */
        public boolean isServerTrusted(
                java.security.cert.X509Certificate[] certs) {
            return true;
        }

        /**
         * Not part of {@code X509TrustManager}; kept for existing callers.
         *
         * @param certs ignored
         * @return always {@code true}
         */
        public boolean isClientTrusted(
                java.security.cert.X509Certificate[] certs) {
            return true;
        }

        /** Accepts any server certificate chain: never throws. */
        @Override
        public void checkServerTrusted(
                java.security.cert.X509Certificate[] certs, String authType)
                throws java.security.cert.CertificateException {
            // intentionally empty: every chain is trusted
        }

        /** Accepts any client certificate chain: never throws. */
        @Override
        public void checkClientTrusted(
                java.security.cert.X509Certificate[] certs, String authType)
                throws java.security.cert.CertificateException {
            // intentionally empty: every chain is trusted
        }
    }

}


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯繫本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM