2020年寒假假期總結0116


  Jsoup實戰爬取(京東手機數據)

  相關依賴和配置說明已經在上一篇隨筆中交代:https://www.cnblogs.com/heiyang/p/12199113.html

  新建關於手機商品的POJO(JavaBean的一種):Item.class

/**
 * JPA entity describing one phone product scraped from JD.
 * Instances are persisted to the {@code jd_item} table.
 */
@Entity
@Table(name = "jd_item") // maps this entity to the jd_item table
public class Item {

    /** Primary key; generated with the IDENTITY strategy. */
    @Id
    @GeneratedValue(strategy = GenerationType.IDENTITY)
    private Long id;

    /** Standard product unit (the product family the item belongs to). */
    private Long spu;

    /** Stock keeping unit (the smallest sellable variant). */
    private Long sku;

    /** Product title. */
    private String title;

    /** Product price. */
    private Double price;

    /** Product picture. */
    private String pic;

    /** Address of the product detail page. */
    private String url;

    /** Creation time of the record. */
    private Date created;

    /** Last update time of the record. */
    private Date updated;

    /** @return the primary key */
    public Long getId() {
        return this.id;
    }

    /** @param id the primary key */
    public void setId(Long id) {
        this.id = id;
    }

    /** @return the standard product unit */
    public Long getSpu() {
        return this.spu;
    }

    /** @param spu the standard product unit */
    public void setSpu(Long spu) {
        this.spu = spu;
    }

    /** @return the stock keeping unit */
    public Long getSku() {
        return this.sku;
    }

    /** @param sku the stock keeping unit */
    public void setSku(Long sku) {
        this.sku = sku;
    }

    /** @return the product title */
    public String getTitle() {
        return this.title;
    }

    /** @param title the product title */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the product price */
    public Double getPrice() {
        return this.price;
    }

    /** @param price the product price */
    public void setPrice(Double price) {
        this.price = price;
    }

    /** @return the product picture */
    public String getPic() {
        return this.pic;
    }

    /** @param pic the product picture */
    public void setPic(String pic) {
        this.pic = pic;
    }

    /** @return the address of the product detail page */
    public String getUrl() {
        return this.url;
    }

    /** @param url the address of the product detail page */
    public void setUrl(String url) {
        this.url = url;
    }

    /** @return the creation time */
    public Date getCreated() {
        return this.created;
    }

    /** @param created the creation time */
    public void setCreated(Date created) {
        this.created = created;
    }

    /** @return the last update time */
    public Date getUpdated() {
        return this.updated;
    }

    /** @param updated the last update time */
    public void setUpdated(Date updated) {
        this.updated = updated;
    }

}

  保存數據到數據庫的接口:ItemServiceImpl

/**
 * Service contract for storing scraped {@link Item} records and looking them up.
 * (Despite the "Impl" suffix this is the interface; {@code ItemService} implements it.)
 */
public interface ItemServiceImpl {

    /**
     * Saves a scraped item.
     *
     * @param item the item to persist
     */
    void save(Item item);

    /**
     * Looks up items matching the given one; used to check whether an item
     * has already been saved.
     *
     * @param item the item whose populated fields serve as the search criteria
     * @return the matching items
     */
    List<Item> findAll(Item item);

}

  實現接口的類:ItemService

/**
 * {@link ItemServiceImpl} implementation that delegates to {@link ItemDao}.
 */
@Service
public class ItemService implements ItemServiceImpl {

    @Autowired
    private ItemDao itemDao;

    /**
     * Persists the given item through the DAO.
     *
     * @param item the item to save
     */
    @Override
    public void save(Item item) {
        itemDao.save(item);
    }

    /**
     * Queries by example: the given item is wrapped in an {@code Example}
     * and handed to the DAO as the search condition.
     *
     * @param item the probe item describing the search condition
     * @return the items returned by the DAO for that example
     */
    @Override
    public List<Item> findAll(Item item) {
        return itemDao.findAll(Example.of(item));
    }
}

  操作數據庫的接口,繼承JpaRepository:ItemDao

/**
 * Repository for {@link Item} entities with {@code Long} identifiers.
 * It declares no methods of its own.
 */
public interface ItemDao extends JpaRepository <Item,Long> {
      // Nothing to declare here: extending JpaRepository is enough, the inherited methods are used as-is.
}

  HttpUtils工具類:HttpUtils

/**
 * HTTP helper built on a pooled Apache HttpClient: downloads pages as text and
 * stores images on the local disk.
 *
 * <p>All public methods are best-effort: failures are printed to stderr and
 * reported through the return value ({@code ""} or {@code null}) instead of
 * being thrown.
 */
@Component
public class HttpUtils {
    private final PoolingHttpClientConnectionManager cm;

    // One client shared by every request; it draws its connections from cm.
    // It is deliberately never closed, because closing it would shut the pool down.
    private final CloseableHttpClient httpClient;

    public HttpUtils() {
        this.cm = new PoolingHttpClientConnectionManager();
        // Maximum number of connections in the pool
        this.cm.setMaxTotal(200);
        // Maximum number of concurrent connections per host
        this.cm.setDefaultMaxPerRoute(20);

        this.httpClient = HttpClients.custom().setConnectionManager(this.cm).build();
    }

    /**
     * Downloads the page at the given address, sending browser-like headers.
     *
     * @param url address of the page
     * @return the page body decoded as UTF-8, or {@code ""} when the request
     *         fails, the status is not 200, or the response has no body
     */
    public String doGetHtml(String url) {
        String html = this.fetchText(url, true);
        return html == null ? "" : html;
    }

    /**
     * Downloads an image into {@code E:/images/} under a random UUID file name
     * that keeps the extension found in the URL.
     *
     * @param url address of the image
     * @return the generated file name, or {@code null} on any failure
     */
    public String doGetImage(String url) {
        return this.downloadImage(url, "E:/images/");
    }

    /**
     * Downloads the page at the given address without any extra headers.
     *
     * @param url address of the page
     * @return the page body decoded as UTF-8 ({@code ""} when a 200 response has
     *         no body), or {@code null} when the request fails or the status is not 200
     */
    public String getHtml(String url) {
        return this.fetchText(url, false);
    }

    /**
     * Downloads an image into {@code D:/images/} under a random UUID file name
     * that keeps the extension found in the URL.
     *
     * @param url address of the image
     * @return the generated file name, or {@code null} on any failure
     */
    public String getImage(String url) {
        return this.downloadImage(url, "D:/images/");
    }

    // Performs a GET and returns the body as text; null means the request failed or was not a 200.
    private String fetchText(String url, boolean browserHeaders) {
        try {
            // Created inside the try so that a malformed address is reported like any other failure.
            HttpGet httpGet = new HttpGet(url);
            if (browserHeaders) {
                // Headers imitating a regular browser request
                httpGet.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; rv:6.0.2) Gecko/20100101 Firefox/6.0.2");
                httpGet.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");
                httpGet.setHeader("Accept-Language", "en-US,en;q=0.8,zh-CN;q=0.5,zh;q=0.3");
                httpGet.setHeader("Referer", "https://www.jd.com/");
                httpGet.setHeader("DNT","1");
                httpGet.setHeader("Connection","keep-alive");
                httpGet.setHeader("Upgrade-Insecure-Requests", "1");
                httpGet.setHeader("TE", "Trailers");
            }
            httpGet.setConfig(this.getConfig());

            // Closing the response releases the connection; the shared client stays open.
            try (CloseableHttpResponse response = this.httpClient.execute(httpGet)) {
                if (response.getStatusLine().getStatusCode() != 200) {
                    return null;
                }
                // EntityUtils.toString fails on a null entity, so check first.
                if (response.getEntity() == null) {
                    return "";
                }
                return EntityUtils.toString(response.getEntity(), "UTF-8");
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return null;
    }

    // Performs a GET and writes the body to directory + <uuid><ext>; returns the file name or null.
    private String downloadImage(String url, String directory) {
        try {
            HttpGet httpGet = new HttpGet(url);
            httpGet.setConfig(this.getConfig());

            try (CloseableHttpResponse response = this.httpClient.execute(httpGet)) {
                // Check the body before creating the file so that no empty file is left behind.
                if (response.getStatusLine().getStatusCode() != 200 || response.getEntity() == null) {
                    return null;
                }

                // File extension: everything from the last '.' of the address
                String extName = url.substring(url.lastIndexOf("."));
                // Random UUID as the file name
                String imageName = UUID.randomUUID().toString() + extName;

                // The stream is closed even when writing fails, so the file handle is never leaked.
                try (OutputStream outstream = new FileOutputStream(new File(directory + imageName))) {
                    response.getEntity().writeTo(outstream);
                }

                return imageName;
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return null;
    }

    // Builds the per-request timeout settings.
    private RequestConfig getConfig() {
        return RequestConfig.custom()
                .setConnectTimeout(1000) // timeout for establishing a connection
                .setConnectionRequestTimeout(500) // timeout for obtaining a connection from the pool
                .setSocketTimeout(10000) // timeout for data transfer on the socket
                .build();
    }

}

  工作任務類(爬取任務):ItemTask

/**
 * Scheduled crawl task: walks JD search result pages for phones, and for every
 * SKU not yet stored downloads its picture, price and title and saves an {@link Item}.
 */
@Component
public class ItemTask {

    @Autowired
    private HttpUtils httpUtils;
    @Autowired
    private ItemService itemService;

    private static final ObjectMapper MAPPER =  new ObjectMapper();


    /**
     * Runs one crawl over the search result pages.
     * The next run starts 100 seconds after the previous one has finished.
     *
     * @throws Exception if parsing a page or a price response fails
     */
    @Scheduled(fixedDelay = 100 * 1000)
    public void itemTask() throws Exception {
        // Search address; the page number is appended below
        String url = "https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq" +
                "=%E6%89%8B%E6%9C%BA&cid2=653&cid3=655&s=113&click=0&page=";

        // Requests page=1,3,5,7,9
        // NOTE(review): presumably JD numbers full result pages with odd values — confirm
        for (int i = 1; i < 10; i = i + 2) {
            String html = httpUtils.doGetHtml(url + i);
            // Parse the page and store the products found on it
            this.parse(html);
        }


        System.out.println("手機數據抓取完成!");


    }

    /**
     * Parses one search result page and stores every SKU that is not yet saved.
     *
     * <p>A missing or non-numeric {@code data-spu} leaves the SPU unset, an
     * unreadable {@code data-sku} skips that entry, and a missing picture or
     * price leaves the corresponding field {@code null}; none of these aborts
     * the crawl.
     *
     * @param html the page source ({@code ""} yields no products)
     * @throws Exception if a non-empty price response is not valid JSON
     */
    private void parse(String html) throws Exception {
        Document doc = Jsoup.parse(html);

        // One <li> per SPU
        Elements spuEles = doc.select("div#J_goodsList > ul > li");
        System.out.println("-----------------數量為:"+spuEles.size());
        for (Element spuEle : spuEles) {
            // May be null: the attribute can be absent or empty
            Long spu = parseLongOrNull(spuEle.attr("data-spu"));

            // The SKUs listed under this SPU
            Elements skuEles = spuEle.select("li.ps-item");

            for (Element skuEle : skuEles) {
                Long sku = parseLongOrNull(skuEle.select("[data-sku]").attr("data-sku"));
                if (sku == null) {
                    // Without a SKU there is nothing to look up or store
                    continue;
                }

                System.out.println("商品的Sku值為"+sku);
                // Look the product up by SKU
                Item item = new Item();
                item.setSku(sku);

                if (!this.itemService.findAll(item).isEmpty()) {
                    // Already stored: move on to the next SKU
                    continue;
                }

                item.setSpu(spu);

                // Address of the product detail page
                String itemUrl = "https://item.jd.com/" + sku + ".html";
                item.setUrl(itemUrl);


                // Product picture; stays null when the page carries no usable image address
                String picName = null;
                Element imgEle = skuEle.select("img[data-sku]").first();
                if (imgEle != null && !imgEle.attr("data-lazy-img").isEmpty()) {
                    String picUrl = "https:" + imgEle.attr("data-lazy-img");
                    picUrl = picUrl.replace("/n9/","/n1/");
                    picName = this.httpUtils.doGetImage(picUrl);
                }
                item.setPic(picName);

                System.out.println("商品的圖片地址:"+picName);

                // Product price; stays null when the request fails or the response lacks [0].p
                Double price = null;
                String priceJson = this.httpUtils.doGetHtml("https://p.3.cn/prices/mgets?skuIds=J_" + sku);
                if (priceJson != null && !priceJson.isEmpty()) {
                    // path() never returns null, so a missing element falls through to the NaN default
                    double parsedPrice = MAPPER.readTree(priceJson).path(0).path("p").asDouble(Double.NaN);
                    if (!Double.isNaN(parsedPrice)) {
                        price = parsedPrice;
                    }
                }
                item.setPrice(price);
                System.out.println("商品的價格:"+price);

                // Product title, read from the detail page
                String itemInfo = this.httpUtils.doGetHtml(item.getUrl());
                String title = Jsoup.parse(itemInfo).select("div.sku-name").text();
                item.setTitle(title);
                System.out.println("商品的標題:"+title);


                item.setCreated(new Date());
                item.setUpdated(item.getCreated());

                // Store the product
                this.itemService.save(item);

            }
        }
    }

    // Parses a decimal long; returns null for null, blank or non-numeric text.
    private static Long parseLongOrNull(String value) {
        if (value == null || value.trim().isEmpty()) {
            return null;
        }
        try {
            return Long.valueOf(value.trim());
        } catch (NumberFormatException e) {
            return null;
        }
    }

}

  最後一步添加引導類:Application

/**
 * Bootstrap class: starts the Spring application with scheduling enabled.
 */
@SpringBootApplication
// Scheduled tasks have to be switched on before they can be used
@EnableScheduling
public class Application {
    /**
     * Program entry point.
     *
     * @param args command-line arguments, passed through to SpringApplication.run
     */
    public static void main(String[] args) {
        SpringApplication.run(Application.class,args);
    }
}

  資源文件夾圖:

 

  爬取結果:

 

 

   注意:我參考的教學視頻發布時間較早,當時京東還沒有反爬機制;現在請求時需要加上瀏覽器的 header(例如上面 doGetHtml 中設置的 User-Agent、Referer 等),才能正常抓取到頁面數據。


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯繫本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM