Project directory structure
Core source code:
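(The original screenshot of the directory tree is not reproduced here. The layout below is only inferred from the packages imported by SpiderStarter and the conf/spider.properties path it reads, so the exact structure may differ.)

spider/
├── conf/
│   └── spider.properties
└── src/
    └── cn/edu/zyt/spider/
        ├── SpiderStarter.java
        ├── model/
        │   └── SpiderParams.java
        ├── queue/
        │   └── UrlQueue.java
        └── worker/
            └── SpiderWorker.java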
package cn.edu.zyt.spider;

import java.io.BufferedInputStream;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.util.Properties;

import cn.edu.zyt.spider.model.SpiderParams;
import cn.edu.zyt.spider.queue.UrlQueue;
import cn.edu.zyt.spider.worker.SpiderWorker;

public class SpiderStarter {

    public static void main(String[] args) {
        // Route HTTP/HTTPS requests through a proxy.
        System.setProperty("java.net.useSystemProxies", "true");
        System.setProperty("http.proxyHost", "113.128.9.37");
        System.setProperty("http.proxyPort", "9999");
        System.setProperty("https.proxyHost", "113.128.9.37");
        System.setProperty("https.proxyPort", "9999");

        // Initialize configuration parameters.
        initializeParams();

        // Initialize the crawl queue.
        initializeQueue();

        // Create and start the worker threads.
        for (int i = 1; i <= SpiderParams.WORKER_NUM; i++) {
            new Thread(new SpiderWorker(i)).start();
        }
    }

    /**
     * Initialize parameters from the configuration file.
     */
    private static void initializeParams() {
        InputStream in;
        try {
            in = new BufferedInputStream(new FileInputStream("conf/spider.properties"));
            Properties properties = new Properties();
            properties.load(in);

            // Read parameters from the configuration file.
            SpiderParams.WORKER_NUM = Integer.parseInt(properties.getProperty("spider.threadNum"));
            SpiderParams.DEYLAY_TIME = Integer.parseInt(properties.getProperty("spider.fetchDelay"));

            in.close();
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Prepare the initial URLs to crawl.
     */
    private static void initializeQueue() {
        // For example, to crawl the Cofeed (天下糧倉) site, generate URLs following the
        // link pattern http://www.cofeed.com/national_1.html and add them to the queue
        // of URLs waiting to be crawled.
        for (int i = 0; i < 3; i++) {
            UrlQueue.addElement("http://www.cofeed.com/national_" + i + ".html");
        }
    }
}
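The starter expects a conf/spider.properties file providing the two keys it reads, spider.threadNum and spider.fetchDelay. A minimal example might look like the following; the values are illustrative, not taken from the original post:

# number of worker threads to start
spider.threadNum=5
# delay between fetches (unit assumed to be milliseconds)
spider.fetchDelay=1000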
Screenshots of the running result:
Since there is quite a lot of page-handling code, it is not all pasted here; to get the complete source, leave a comment below this post.
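For readers who want a rough idea of the supporting classes before asking for the full source, the sketches below show only the shape implied by how SpiderStarter uses them (static parameters, a shared thread-safe URL queue, and a Runnable worker). Apart from SpiderParams.WORKER_NUM, SpiderParams.DEYLAY_TIME, UrlQueue.addElement and the SpiderWorker(int) constructor, everything here (poll, isEmpty, the worker loop, default values) is an assumption, not the author's code.

package cn.edu.zyt.spider.model;

// Holds configuration values loaded by SpiderStarter.initializeParams().
public class SpiderParams {
    public static int WORKER_NUM = 5;      // spider.threadNum
    public static int DEYLAY_TIME = 1000;  // spider.fetchDelay (field name kept as in the original)
}

package cn.edu.zyt.spider.queue;

import java.util.concurrent.ConcurrentLinkedQueue;

// A shared, thread-safe queue of URLs waiting to be crawled.
public class UrlQueue {
    private static final ConcurrentLinkedQueue<String> queue = new ConcurrentLinkedQueue<>();

    public static void addElement(String url) {
        queue.offer(url);
    }

    // Hypothetical accessors; only addElement appears in the original post.
    public static String poll() {
        return queue.poll();
    }

    public static boolean isEmpty() {
        return queue.isEmpty();
    }
}

package cn.edu.zyt.spider.worker;

import cn.edu.zyt.spider.model.SpiderParams;
import cn.edu.zyt.spider.queue.UrlQueue;

// Each worker repeatedly takes a URL from the queue, processes it,
// and sleeps for the configured delay between requests.
public class SpiderWorker implements Runnable {
    private final int id;

    public SpiderWorker(int id) {
        this.id = id;
    }

    @Override
    public void run() {
        String url;
        while ((url = UrlQueue.poll()) != null) {
            System.out.println("Worker " + id + " fetching " + url);
            // ... download and parse the page here ...
            try {
                Thread.sleep(SpiderParams.DEYLAY_TIME);
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                return;
            }
        }
    }
}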