Java 爬虫 WebMagic 案例:爬取动态(Ajax + JS)网站——京东商品售价项目


代码共分为3个部分:

  1. 爬取京东手机ID与名称
  2. 爬取京东手机ID与价格
  3. 组织json

为什么没有把这几部分合并在一起?原因是:价格并不在列表页的 HTML 中,需要先用爬到的手机ID拼接出价格查询URL,再单独请求该URL获取价格,所以拆成了独立的步骤。

项目采用 Maven 管理,pom.xml 如下:

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>org.study</groupId>
    <artifactId>WebMagicStudy</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <packaging>jar</packaging>

    <name>WebMagicStudy</name>
    <url>http://maven.apache.org</url>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <webmagic.version>0.5.3</webmagic.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>com.fasterxml.jackson.core</groupId>
            <artifactId>jackson-core</artifactId>
            <version>2.7.3</version>
        </dependency>
        <dependency>
            <groupId>com.fasterxml.jackson.core</groupId>
            <artifactId>jackson-databind</artifactId>
            <version>2.7.3</version>
        </dependency>


        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-samples</artifactId>
            <version>0.5.2</version>
        </dependency>


        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-selenium</artifactId>
            <version>0.5.2</version>
        </dependency>

        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-core</artifactId>
            <version>${webmagic.version}</version>
        </dependency>
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-extension</artifactId>
            <version>${webmagic.version}</version>
            <exclusions>
                <exclusion>
                    <groupId>org.slf4j</groupId>
                    <artifactId>slf4j-log4j12</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>3.8.1</version>
            <scope>test</scope>
        </dependency>
    </dependencies>
</project>
package org.study.WebMagicStudy;

import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.FilePipeline;
import us.codecraft.webmagic.processor.PageProcessor;


public class JDAjaxProcessor implements PageProcessor {
    public static final String URL_LIST = "http://list\\.jd\\.com/list\\.html\\?cat=9987,653,655&page=\\d+\\&go=0\\&JL=6_0_0";
    //用于存储{key:手机ID,value:手机名称}
    static Map<String,String> map = new HashMap<String, String>();
    static Set<String> uri = new HashSet<String>();
    public static void main(String[] args) {
        String list = "http://list.jd.com/list.html?cat=9987,653,655&page=1&go=0&JL=6_0_0";
        Spider.create(new JDAjaxProcessor()).addUrl(list)
         .addPipeline(new FilePipeline("D:\\webmagic\\"))
        .run();
        for (String s : map.values()) {
            System.out.println(s);
        }
        System.out.println("map-->" + map.size());
        System.out.println(map.get("10274956063"));
    }

    private Site site = Site.me().setRetryTimes(3).setSleepTime(100);

    public Site getSite() {
        return site;
    }

    public void process(Page page) {
        if (page.getUrl().regex(URL_LIST).match()) {
//            page.setSkip(true);
            page.putField("id",page.getHtml().xpath("//div[@class='p-focus']/a/@data-sku").all());
            page.putField("name",page.getHtml().xpath("//div[@class='p-name']/a/em/text()").all());
            List<String> ids = (List<String>) page.getResultItems().get("id");
            List<String> name = (List<String>) page.getResultItems().get("name");
            String makerUrl = makerUrl(ids);
//            System.out.println("价格连接" + makerUrl);
            //key:id,value:price
            Map<String, String> running = JDJsonPreocessor.running(makerUrl);
            for (int i = 0; i < name.size(); i++) {
                String price = running.get("J_"+ids.get(i));
                map.put(ids.get(i), name.get(i) +"\t"+ price);
            }
            
            page.addTargetRequests(page.getHtml().links().regex(URL_LIST).all());
        }
    }
    
    public String makerUrl(List<String> ids){
        StringBuffer sb = new StringBuffer();
        for (String id : ids) {
            sb.append("J_"+id+",");
        }
        String substring = sb.substring(0, sb.length()-1);
        return "http://p.3.cn/prices/mgets?skuIds="+substring+"&callback=result";
    }
    public void writeFile(){
        
    }
}
package org.study.WebMagicStudy;

import java.util.HashMap;
import java.util.List;
import java.util.Map;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;

/**
 * Hello world!
 *
 */
public class JDJsonPreocessor implements PageProcessor {
    static Map<String,String> maps = new HashMap<String, String>();
    
    
    public static Map<String,String> running(String url) {

        Spider.create(new JDJsonPreocessor()).addUrl(url).run();
        return maps;
    }

    private Site site = Site.me().setRetryTimes(3).setSleepTime(100);

    public Site getSite() {
        return site;
    }

    public void process(Page page) {
        page.setSkip(true);
        String text = page.getRawText();
        int begin = text.indexOf("[");
        int end = text.indexOf("]");
        String substring = text.substring(begin, end + 1);
        String jsonName = "result";
        String json = "{\"" + jsonName + "\":" + substring + "}";

        Map<String, Object> map = JsonUtil.jsonToMap(json);
        List<Map<String, Object>> list = (List<Map<String, Object>>) map.get(jsonName);
        for (Map<String, Object> map1 : list) {
            String key = map1.get("id").toString();
            String value = map1.get("p").toString();
            maps.put(key, value);
        }
    }

}
package org.study.WebMagicStudy;


import java.util.List;
import java.util.Map;

import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;

/**
 * json 工具类
 *
 */
public class JsonUtil { 

    /**
     * 转换json 字符串为map
     * @param jsonStr
     * @return
     */
    @SuppressWarnings("unchecked")
    public static Map<String,Object> jsonToMap(String jsonStr){
        ObjectMapper om = new ObjectMapper();
        Map<String, Object> map = null;
        try {
            map = om.readValue(jsonStr, Map.class);
        } catch (Exception e) {
            e.printStackTrace();
        }
        return map;
    }
    
    /**
     * 将一个map转换成JSON对象
     * @param param
     * @return
     */
    public static String mapToJson(Map<String,Object> m) {
        ObjectMapper om = new ObjectMapper();

        String json="";
        try {
            json = om.writeValueAsString(m);
        } catch (Exception e) {
            e.printStackTrace();
        }
        return json;
    }
    /**
     * 将一个list转换成json串,转换后格式
     * [{key:'value'},{key:'value'},{key:'value'}...]
     * @param param
     * @return
     */
    
    public static String listToJson(List<Map<String,Object>> list) {
        ObjectMapper om = new ObjectMapper();
        String json = "";
        try {
            json = om.writeValueAsString(list);
        } catch (JsonProcessingException e) {
            e.printStackTrace();
        }
        //System.out.println(json);
        return json;
    }
    
    /**
     * 将一个array类型的json串转换成list
     * @param jsonStr
     * @return
     */
    @SuppressWarnings("unchecked")
    public static List<Map<String,Object>> jsonToList(String jsonStr){
        ObjectMapper om = new ObjectMapper();
        List<Map<String, Object>> list = null;
        try {
            list = om.readValue(jsonStr, List.class);
        } catch (Exception e) {
            e.printStackTrace();
        }
        return list;
    }

}

 


免责声明!

本站转载的文章为个人学习借鉴使用,本站对版权不负任何法律责任。如果侵犯了您的隐私权益,请联系本站邮箱yoyou2525@163.com删除。



 
粤ICP备18138465号  © 2018-2025 CODEPRJ.COM