java爬取網站中所有網頁的源代碼和鏈接

本文轉載自查看原文 2019-11-19 16:25 1127

1. 網絡爬蟲是一個自動提取網頁的程序，它為搜索引擎從萬維網上下載網頁，是搜索引擎的重要組成。傳統爬蟲從一個或若干初始網頁的URL開始，獲得初始網頁上的URL，在抓取網頁的過程中，不斷從當前頁面上抽取新的URL放入隊列，直到滿足系統的一定停止條件。

所以主要使用遞歸遍歷完成對每個網頁內鏈接的獲取和源碼的獲取，然后剔除重復鏈接

數據爬取后主要使用txt文件儲存，根據網址的路徑生成想應文件路徑

package com.test;

import java.io.*;
import java.net.URL;
import java.net.URLConnection;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
 
/**
 * java實現爬蟲
 */
public class SpiderDemo1 {
 
    //網站主網頁鏈接
    private final static String theURL = "http://www.jyyishu.cn";
    //今日日期，用於標記日志
    private final static String theTIME = new SimpleDateFormat("yyyy-MM-dd").format(new Date());
    //網頁鏈接文件路徑
    private final static String theFILE = "L:/html/jy1/" + theTIME + "/URL.txt";
    //網頁源碼路徑
    private final static String thePATH = "L:/html/jy1/" + theTIME + "/code";
    //正則表達式，用於判斷是否是一個網址
    private final static String theREGEX= "(http|https)://[\\w+\\.?/?]+\\.[A-Za-z]+";
 
    /**
     * 啟動類
     * @param args
     */
    public static void main(String[] args) {
        found();
        System.out.println("網站爬取完成");
    }
 
 
    public static void found() {
        PrintWriter pw = null;
        try{
            //創建文件目錄
            File fileDir = new File(thePATH);
            if (!fileDir.exists()) {
                fileDir.mkdirs();
            }
 
            //創建網站網頁鏈接文件
            pw = new PrintWriter(new FileWriter(theFILE),true);
 
            //使用遞歸遍歷網站的每個網頁
            spiderURL(theURL, pw);
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            try {
                if(pw != null) {
                    pw.close();
                }
            } catch(Exception e) {
                e.printStackTrace();
            }
        }
    }
 
 
    /**
     * 爬取該網頁源碼和網頁內連接
     * @param url 該網頁網址
     * @param tpw 對網站網頁網址文件連接的io流
     */
    public static void spiderURL(String url, PrintWriter tpw){
        URL realURL=null;
        URLConnection connection=null;
        BufferedReader br=null;
        PrintWriter pw=null;
        PrintWriter pw1=null;
 
        Pattern pattern=Pattern.compile(theREGEX);
        try{
            realURL=new URL(url);
            connection=realURL.openConnection();
 
            //生成文件夾
            String src = thePATH + url.substring(theURL.length());
            File fileDir = new File(src);
            if (!fileDir.exists()) {
                fileDir.mkdirs();
            }
 
            //生成源代碼文件
            pw = new PrintWriter(new FileWriter(src + "/Test.txt"),true);
            pw1 = tpw;
 
            //爬取網頁文件
            br=new BufferedReader(new InputStreamReader(connection.getInputStream()));
            String line=null;
            while((line=br.readLine())!=null){
                //把爬取的源碼寫入文件
                pw.println(line);
                System.out.println("爬取網頁" + url + "成功");
                Matcher matcher=pattern.matcher(line);
                //判斷是否是一個網址
                while(matcher.find()){
                    //判斷網址是否以網站主網址為前綴，防止爬到其他網站,並判斷是否和原先網址重復
                    if(matcher.group().startsWith(theURL) && examine(matcher.group())) {
                        //把爬取的網址寫入文件
                        pw1.println(matcher.group());
                        spiderURL(matcher.group(), pw1);
                    }
                }
                System.out.println("網頁" + url + "內鏈接爬取完成");
            }
    }catch(Exception e){
        e.printStackTrace();
    }finally {
            try {
                if(br != null) {
                    br.close();
                }
                if(pw != null) {
                    pw.close();
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
 
 
    /**
     * 判斷是否和以儲存網址相同
     * @param str 需判斷的網址
     * @return 是否重復
     */
    public static boolean examine(String str) {
        BufferedReader br = null;
        String str1;
        try {
            br = new BufferedReader(new FileReader(theFILE));
 
//            //針對該網站無用網頁的屏蔽
//            if(str.startsWith("http://www.jyyishu.cn/artnews/")) {
//                return false;
//            }
            
            //循環文件中每一行的網址，判斷是否重復，重復則退出
            while((str1 = br.readLine()) != null) {
                if(str.equals(str1)) {
                    return false;
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            try{
                if(br != null) {
                    br.close();
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        return true;
    }
}

2. 爬取后的數據

部分鏈接：

網頁數據：

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯系本站郵箱yoyou2525@163.com刪除。

猜您在找 JAVA爬取網頁郵箱 python：爬取博主的所有文章的鏈接、標題和內容 python 實現爬取網站下所有URL python 實現爬取網站下所有URL 針對源代碼和檢查元素不一致的網頁爬蟲——利用Selenium、PhantomJS、bs4爬取12306的列車途徑站信息網頁源碼爬取爬取https網站如何修改網頁源代碼學習使用Java的webmagic框架爬取網頁內容 Java根據一個網址鏈接獲取源代碼