Project directory structure
Core source code:
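(The original screenshot of the directory tree is not reproduced here. The layout below is only inferred from the packages imported by SpiderStarter and the conf/spider.properties path it reads, so the exact structure may differ.)

spider/
├── conf/
│   └── spider.properties
└── src/
    └── cn/edu/zyt/spider/
        ├── SpiderStarter.java
        ├── model/
        │   └── SpiderParams.java
        ├── queue/
        │   └── UrlQueue.java
        └── worker/
            └── SpiderWorker.java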
package cn.edu.zyt.spider;

import java.io.BufferedInputStream;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.util.Properties;

import cn.edu.zyt.spider.model.SpiderParams;
import cn.edu.zyt.spider.queue.UrlQueue;
import cn.edu.zyt.spider.worker.SpiderWorker;

public class SpiderStarter {

    public static void main(String[] args) {
        // Route HTTP/HTTPS requests through a proxy.
        System.setProperty("java.net.useSystemProxies", "true");
        System.setProperty("http.proxyHost", "113.128.9.37");
        System.setProperty("http.proxyPort", "9999");
        System.setProperty("https.proxyHost", "113.128.9.37");
        System.setProperty("https.proxyPort", "9999");

        // Initialize configuration parameters.
        initializeParams();

        // Initialize the crawl queue.
        initializeQueue();

        // Create and start the worker threads.
        for (int i = 1; i <= SpiderParams.WORKER_NUM; i++) {
            new Thread(new SpiderWorker(i)).start();
        }
    }

    /**
     * Initialize parameters from the configuration file.
     */
    private static void initializeParams() {
        InputStream in;
        try {
            in = new BufferedInputStream(new FileInputStream("conf/spider.properties"));
            Properties properties = new Properties();
            properties.load(in);

            // Read parameters from the configuration file.
            SpiderParams.WORKER_NUM = Integer.parseInt(properties.getProperty("spider.threadNum"));
            SpiderParams.DEYLAY_TIME = Integer.parseInt(properties.getProperty("spider.fetchDelay"));

            in.close();
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Prepare the initial URLs to crawl.
     */
    private static void initializeQueue() {
        // For example, to crawl the Cofeed (天下糧倉) site, generate URLs following the
        // link pattern http://www.cofeed.com/national_1.html and add them to the queue
        // of URLs waiting to be crawled.
        for (int i = 0; i < 3; i++) {
            UrlQueue.addElement("http://www.cofeed.com/national_" + i + ".html");
        }
    }
}
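The starter expects a conf/spider.properties file providing the two keys it reads, spider.threadNum and spider.fetchDelay. A minimal example might look like the following; the values are illustrative, not taken from the original post:

# number of worker threads to start
spider.threadNum=5
# delay between fetches (unit assumed to be milliseconds)
spider.fetchDelay=1000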
Screenshots of the running result:
Since there is quite a lot of page-handling code, it is not all pasted here; to get the complete source, leave a comment below this post.
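For readers who want a rough idea of the supporting classes before asking for the full source, the sketches below show only the shape implied by how SpiderStarter uses them (static parameters, a shared thread-safe URL queue, and a Runnable worker). Apart from SpiderParams.WORKER_NUM, SpiderParams.DEYLAY_TIME, UrlQueue.addElement and the SpiderWorker(int) constructor, everything here (poll, isEmpty, the worker loop, default values) is an assumption, not the author's code.

package cn.edu.zyt.spider.model;

// Holds configuration values loaded by SpiderStarter.initializeParams().
public class SpiderParams {
    public static int WORKER_NUM = 5;      // spider.threadNum
    public static int DEYLAY_TIME = 1000;  // spider.fetchDelay (field name kept as in the original)
}

package cn.edu.zyt.spider.queue;

import java.util.concurrent.ConcurrentLinkedQueue;

// A shared, thread-safe queue of URLs waiting to be crawled.
public class UrlQueue {
    private static final ConcurrentLinkedQueue<String> queue = new ConcurrentLinkedQueue<>();

    public static void addElement(String url) {
        queue.offer(url);
    }

    // Hypothetical accessors; only addElement appears in the original post.
    public static String poll() {
        return queue.poll();
    }

    public static boolean isEmpty() {
        return queue.isEmpty();
    }
}

package cn.edu.zyt.spider.worker;

import cn.edu.zyt.spider.model.SpiderParams;
import cn.edu.zyt.spider.queue.UrlQueue;

// Each worker repeatedly takes a URL from the queue, processes it,
// and sleeps for the configured delay between requests.
public class SpiderWorker implements Runnable {
    private final int id;

    public SpiderWorker(int id) {
        this.id = id;
    }

    @Override
    public void run() {
        String url;
        while ((url = UrlQueue.poll()) != null) {
            System.out.println("Worker " + id + " fetching " + url);
            // ... download and parse the page here ...
            try {
                Thread.sleep(SpiderParams.DEYLAY_TIME);
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                return;
            }
        }
    }
}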