代码共分为 3 个部分:
- 爬取京东手机ID与名称
- 爬取京东手机ID与价格
- 组织json
之所以没有把前两步合并在一起,是因为中间还有一个根据手机ID拼接价格查询URL的步骤。
项目采用 Maven 管理,pom.xml 如下:
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <!-- Maven build descriptor for the WebMagicStudy crawler; packaged as a plain jar. -->
  <modelVersion>4.0.0</modelVersion>

  <groupId>org.study</groupId>
  <artifactId>WebMagicStudy</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  <packaging>jar</packaging>

  <name>WebMagicStudy</name>
  <url>http://maven.apache.org</url>

  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <!-- Version shared by webmagic-core and webmagic-extension below. -->
    <webmagic.version>0.5.3</webmagic.version>
  </properties>

  <dependencies>
    <!-- Jackson: provides the ObjectMapper used for JSON parsing/serialization. -->
    <dependency>
      <groupId>com.fasterxml.jackson.core</groupId>
      <artifactId>jackson-core</artifactId>
      <version>2.7.3</version>
    </dependency>
    <dependency>
      <groupId>com.fasterxml.jackson.core</groupId>
      <artifactId>jackson-databind</artifactId>
      <version>2.7.3</version>
    </dependency>

    <!-- NOTE(review): the next two artifacts are pinned to 0.5.2 while core/extension
         use ${webmagic.version} (0.5.3); confirm the version mismatch is intentional. -->
    <dependency>
      <groupId>us.codecraft</groupId>
      <artifactId>webmagic-samples</artifactId>
      <version>0.5.2</version>
    </dependency>
    <dependency>
      <groupId>us.codecraft</groupId>
      <artifactId>webmagic-selenium</artifactId>
      <version>0.5.2</version>
    </dependency>

    <!-- WebMagic crawler framework (Spider, Page, Site, PageProcessor, pipelines). -->
    <dependency>
      <groupId>us.codecraft</groupId>
      <artifactId>webmagic-core</artifactId>
      <version>${webmagic.version}</version>
    </dependency>
    <dependency>
      <groupId>us.codecraft</groupId>
      <artifactId>webmagic-extension</artifactId>
      <version>${webmagic.version}</version>
      <exclusions>
        <!-- Keeps the transitive slf4j-log4j12 binding off the classpath. -->
        <exclusion>
          <groupId>org.slf4j</groupId>
          <artifactId>slf4j-log4j12</artifactId>
        </exclusion>
      </exclusions>
    </dependency>

    <!-- Test-scope only. -->
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>3.8.1</version>
      <scope>test</scope>
    </dependency>
  </dependencies>
</project>
package org.study.WebMagicStudy; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.pipeline.FilePipeline; import us.codecraft.webmagic.processor.PageProcessor; public class JDAjaxProcessor implements PageProcessor { public static final String URL_LIST = "http://list\\.jd\\.com/list\\.html\\?cat=9987,653,655&page=\\d+\\&go=0\\&JL=6_0_0"; //用于存储{key:手机ID,value:手机名称} static Map<String,String> map = new HashMap<String, String>(); static Set<String> uri = new HashSet<String>(); public static void main(String[] args) { String list = "http://list.jd.com/list.html?cat=9987,653,655&page=1&go=0&JL=6_0_0"; Spider.create(new JDAjaxProcessor()).addUrl(list) .addPipeline(new FilePipeline("D:\\webmagic\\")) .run(); for (String s : map.values()) { System.out.println(s); } System.out.println("map-->" + map.size()); System.out.println(map.get("10274956063")); } private Site site = Site.me().setRetryTimes(3).setSleepTime(100); public Site getSite() { return site; } public void process(Page page) { if (page.getUrl().regex(URL_LIST).match()) { // page.setSkip(true); page.putField("id",page.getHtml().xpath("//div[@class='p-focus']/a/@data-sku").all()); page.putField("name",page.getHtml().xpath("//div[@class='p-name']/a/em/text()").all()); List<String> ids = (List<String>) page.getResultItems().get("id"); List<String> name = (List<String>) page.getResultItems().get("name"); String makerUrl = makerUrl(ids); // System.out.println("价格连接" + makerUrl); //key:id,value:price Map<String, String> running = JDJsonPreocessor.running(makerUrl); for (int i = 0; i < name.size(); i++) { String price = running.get("J_"+ids.get(i)); map.put(ids.get(i), name.get(i) +"\t"+ price); } page.addTargetRequests(page.getHtml().links().regex(URL_LIST).all()); } } public String makerUrl(List<String> ids){ 
StringBuffer sb = new StringBuffer(); for (String id : ids) { sb.append("J_"+id+","); } String substring = sb.substring(0, sb.length()-1); return "http://p.3.cn/prices/mgets?skuIds="+substring+"&callback=result"; } public void writeFile(){ } }
package org.study.WebMagicStudy;

import java.util.HashMap;
import java.util.List;
import java.util.Map;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;

/**
 * Downloads a price response, extracts the JSON array embedded in it and
 * records each entry's "id" -> "p" pair in {@link #maps}.
 */
public class JDJsonPreocessor implements PageProcessor {

    /**
     * Accumulated id -> price pairs. The map is static and never cleared, so
     * entries from earlier {@link #running(String)} calls remain visible.
     */
    static Map<String,String> maps = new HashMap<String, String>();

    /**
     * Runs a spider on the given URL and returns the shared price map.
     *
     * @param url the price-query URL to download
     * @return the shared map of all id -> price pairs collected so far (never null)
     */
    public static Map<String,String> running(String url) {
        Spider.create(new JDJsonPreocessor()).addUrl(url).run();
        return maps;
    }

    /** Site settings: retry count 3, sleep time 100. */
    private Site site = Site.me().setRetryTimes(3).setSleepTime(100);

    public Site getSite() {
        return site;
    }

    /**
     * Parses the raw response text. The text from the first '[' to the last ']'
     * is treated as a JSON array of objects; every object that has both an
     * "id" and a "p" field contributes one entry to {@link #maps}.
     * A response without such an array, or one that fails to parse, is ignored.
     *
     * @param page the downloaded price response
     */
    public void process(Page page) {
        // The result is delivered through the static map, not through pipelines.
        page.setSkip(true);

        String text = page.getRawText();
        if (text == null) {
            return;
        }

        // Strip whatever wraps the array (the request asks for callback=result).
        int begin = text.indexOf('[');
        int end = text.lastIndexOf(']');
        if (begin < 0 || end < begin) {
            return;
        }

        // jsonToList returns null when the text is not valid JSON.
        List<Map<String, Object>> entries = JsonUtil.jsonToList(text.substring(begin, end + 1));
        if (entries == null) {
            return;
        }

        for (Map<String, Object> entry : entries) {
            if (entry == null) {
                continue;
            }
            Object id = entry.get("id");
            Object price = entry.get("p");
            // Skip incomplete entries instead of failing the whole batch.
            if (id == null || price == null) {
                continue;
            }
            maps.put(id.toString(), price.toString());
        }
    }
}
package org.study.WebMagicStudy; import java.util.List; import java.util.Map; import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.ObjectMapper; /** * json 工具类 * */ public class JsonUtil { /** * 转换json 字符串为map * @param jsonStr * @return */ @SuppressWarnings("unchecked") public static Map<String,Object> jsonToMap(String jsonStr){ ObjectMapper om = new ObjectMapper(); Map<String, Object> map = null; try { map = om.readValue(jsonStr, Map.class); } catch (Exception e) { e.printStackTrace(); } return map; } /** * 将一个map转换成JSON对象 * @param param * @return */ public static String mapToJson(Map<String,Object> m) { ObjectMapper om = new ObjectMapper(); String json=""; try { json = om.writeValueAsString(m); } catch (Exception e) { e.printStackTrace(); } return json; } /** * 将一个list转换成json串,转换后格式 * [{key:'value'},{key:'value'},{key:'value'}...] * @param param * @return */ public static String listToJson(List<Map<String,Object>> list) { ObjectMapper om = new ObjectMapper(); String json = ""; try { json = om.writeValueAsString(list); } catch (JsonProcessingException e) { e.printStackTrace(); } //System.out.println(json); return json; } /** * 将一个array类型的json串转换成list * @param jsonStr * @return */ @SuppressWarnings("unchecked") public static List<Map<String,Object>> jsonToList(String jsonStr){ ObjectMapper om = new ObjectMapper(); List<Map<String, Object>> list = null; try { list = om.readValue(jsonStr, List.class); } catch (Exception e) { e.printStackTrace(); } return list; } }