之前在公司項目使用了webMagic爬蟲,對某個網站爬取數據,包括圖片下載保存。
現在想想好像也不怎么了解 webMagic,差不多忘掉了……然后就重新簡單地寫個例子試試。 晚點應該會用 webMagic 重新完成之前的任務。 (閑着也是閑着,溫故而知新嘛)
用到webMagic爬蟲, 最主要的就是 實現 PageProcessor 這個接口, 實現 process這個方法。
還要掌握正則表達式,css,xpath等。 (對正則不咋熟的我表示很尷尬)
然后……
上代碼~
github地址:https://github.com/fightingFisher/webmagicTest.git
package com.xu.webmagic.main; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.processor.PageProcessor; import us.codecraft.webmagic.selector.Selectable; import com.xu.webmagic.TestPipeline; public class TestProcessor implements PageProcessor { private Site site = Site.me().setRetryTimes(10).setSleepTime(1000); // 列表頁 list_url = "http://www.cnblogs.com/dick159/default.html?page="; // private Set<String> done_url = new HashSet<String>(); // 詳情頁url = "http://www.cnblogs.com/dick159/p/^\\d{7}$.html"; @Override public void process(Page page) { String detail_urls_Xpath = "//*[@class='postTitle']/a[@class='postTitle2']/@href"; String next_page_xpath = "//*[@id='nav_next_page']/a/@href"; String next_page_css = "#homepage_top_pager > div:nth-child(1) > a:nth-child(7)"; String title_xpath = "//h1[@class='postTitle']/a/text()"; String date_xpath = "//span[@id='post-date']/text()"; page.putField("title", page.getHtml().xpath(title_xpath).toString()); if (page.getResultItems().get("title") == null) { page.setSkip(true); } page.putField("date", page.getHtml().xpath(date_xpath).toString()); if (page.getHtml().xpath(detail_urls_Xpath).match()) { Selectable detailUrls = page.getHtml().xpath(detail_urls_Xpath); page.addTargetRequests(detailUrls.all()); } if (page.getHtml().xpath(next_page_xpath).match()) { Selectable nextPageUrl = page.getHtml().xpath(next_page_xpath); page.addTargetRequests(nextPageUrl.all()); } else if (page.getHtml().css(next_page_css).match()) { Selectable nextPageUrl = page.getHtml().css(next_page_css).links(); page.addTargetRequests(nextPageUrl.all()); } } @Override public Site getSite() { return this.site; } @SuppressWarnings("deprecation") public static void main(String[] args) { TestProcessor processor = new TestProcessor(); Spider.create(processor) .addUrl("http://www.cnblogs.com/dick159/default.html?page=1") .pipeline(new TestPipeline()).thread(5).run(); } }
package com.xu.webmagic;

import java.util.Map;

import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;

/**
 * Console pipeline: prints the crawled page's URL followed by every
 * extracted field as a "key---value" line.
 */
public class TestPipeline implements Pipeline {

    @Override
    public void process(ResultItems resultItems, Task task) {
        System.out.println("get page: " + resultItems.getRequest().getUrl());
        // Extracted data is stored as a Map<String, Object>;
        // post-process the crawled fields here.
        for (Map.Entry<String, Object> entry : resultItems.getAll().entrySet()) {
            System.out.println(entry.getKey() + "---" + entry.getValue());
        }
    }
}
附上結果圖:

