```java
import java.util.List;

import cn.wanghaomiao.xpath.exception.XpathSyntaxErrorException;
import cn.wanghaomiao.xpath.model.JXDocument;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;

// A simple example crawler: it fetches the person entries listed under a
// "people" category page on Baidu Baike.
public class BaikeProcessor implements PageProcessor {

    // Total number of list pages to crawl
    private int totalPage = 10;

    // Index of the current list page
    private int currentPage = 1;

    // Site configuration: retry count, crawl interval, timeout, charset, etc.
    private Site site = Site.me().setRetryTimes(3).setSleepTime(1000).setTimeOut(20000).setCharset("utf-8");

    // process() is the core hook for customizing crawler logic; extraction is written here
    public void process(Page page) {
        dealPage(page);
    }

    public Site getSite() {
        return site;
    }

    // Entry point for a test run
    public static void main(String[] args) {
        Spider.create(new BaikeProcessor())
                .addUrl("http://baike.baidu.com/fenlei/%E6%94%BF%E6%B2%BB%E4%BA%BA%E7%89%A9?limit=30&index=1&offset=0#gotoList")
                // crawl with a single thread
                .thread(1)
                // start the spider
                .run();
    }

    // Is this a list page?
    public boolean isListPage(Page page) {
        String url = page.getUrl().toString();
        return url.contains("fenlei");
    }

    public void dealPage(Page page) {
        // List page: queue the next list page plus every content page it links to
        if (isListPage(page)) {
            if (currentPage < totalPage) {
                // Queue the next list page
                String nextUrl = "http://baike.baidu.com/fenlei/%E6%94%BF%E6%B2%BB%E4%BA%BA%E7%89%A9?limit=30&index=" + (currentPage + 1) + "&offset=0#gotoList";
                page.addTargetRequest(new Request(nextUrl));
                currentPage++;

                // Queue the content pages linked from this list page
                List<String> urls = page.getHtml().xpath("//div[@class='grid-list']/ul/li/div[@class='list']").links().all();
                for (String contentUrl : urls) {
                    Request request = new Request(contentUrl);
                    // Give content pages a higher priority than list pages
                    request.setPriority(10000);
                    page.addTargetRequest(request);
                }
            }
        } else {
            // Content page: simply print the title and a short piece of the summary
            try {
                String rawHtml = page.getRawText();
                JXDocument document = new JXDocument(rawHtml);

                List<Object> title = document.sel("//div[@class='content']//div[@class='main-content']/dl/dd/h1/text()");
                List<Object> description = document.sel("//div[@class='main-content']/div[@class='lemma-summary']/div[@class='para']/text()");

                if (title.size() > 0) System.out.println(title.get(0) + ":");
                if (description.size() > 0) System.out.println(description.get(0));
            } catch (XpathSyntaxErrorException e) {
                e.printStackTrace();
            }
        }
    }
}
```
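The processor above prints its results directly with `System.out.println`. If you would rather hand the extracted values to WebMagic's pipeline mechanism, a minimal sketch follows. It assumes `dealPage()` is changed to store the values with `page.putField(...)` (the field names and the class name `BaikePipelineRunner` are made up for illustration, not part of the original example):

```java
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.ConsolePipeline;

// Hypothetical runner: same processor, but results stored via page.putField(...)
// are handed to a ConsolePipeline instead of being printed inside dealPage().
public class BaikePipelineRunner {
    public static void main(String[] args) {
        Spider.create(new BaikeProcessor())
                .addUrl("http://baike.baidu.com/fenlei/%E6%94%BF%E6%B2%BB%E4%BA%BA%E7%89%A9?limit=30&index=1&offset=0#gotoList")
                // ConsolePipeline prints each page's result map; a FilePipeline could persist it to disk instead
                .addPipeline(new ConsolePipeline())
                .thread(1)
                .run();
    }
}
```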
Add the dependencies (Maven):
```xml
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-core</artifactId>
    <version>0.4.2</version>
</dependency>
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-extension</artifactId>
    <version>0.4.2</version>
</dependency>
<dependency>
    <groupId>cn.wanghaomiao</groupId>
    <artifactId>JsoupXpath</artifactId>
    <version>0.3.2</version>
</dependency>
```
Of the pages involved:
List page: the category listing page, whose URL contains "fenlei"; the seed URL added in main() is the first one.
Content page: an individual person's entry page, reached through the links extracted from the list page.
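The content pages are parsed with JsoupXpath's `JXDocument`/`sel` API, as shown in `dealPage()` above. A minimal standalone sketch of that API, using a made-up HTML snippet rather than a real Baidu Baike page:

```java
import java.util.List;

import cn.wanghaomiao.xpath.exception.XpathSyntaxErrorException;
import cn.wanghaomiao.xpath.model.JXDocument;

// Standalone sketch of the JsoupXpath usage from the crawler: build a JXDocument
// from raw HTML and evaluate an XPath expression with sel(). The HTML is invented.
public class JsoupXpathDemo {
    public static void main(String[] args) throws XpathSyntaxErrorException {
        String html = "<div class='main-content'><dl><dd><h1>Example Person</h1></dd></dl></div>";
        JXDocument document = new JXDocument(html);
        List<Object> title = document.sel("//div[@class='main-content']/dl/dd/h1/text()");
        if (!title.isEmpty()) {
            System.out.println(title.get(0)); // should print "Example Person"
        }
    }
}
```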