使用WebMagic框架爬取京東數據


WebMagic框架介紹:

  WebMagic框架是一個爬蟲框架,其底層是HttpClient和Jsoup。WebMagic的結構分為Downloader、PageProcessor、Scheduler、Pipeline四大組件,並由Spider將它們彼此組織起來。

  WebMagic總體架構圖如下

  

 

爬取京東數據各個組件的流程:

  downloader
      1.判斷獲取到的地址是下一頁地址還是普通地址
      2.若是下一頁地址則獲取附件,用無頭瀏覽模式加載到該地址
      3.再用Selenium操作瀏覽器點擊下一頁
      4.用Selenium操作瀏覽器拉到最下方
      5.把渲染好的頁面傳給pageProcessor
      6.若是普通地址,也要區分是第一頁地址還是詳情頁面地址
      7.若是詳情頁面則直接傳給pageProcessor
      8.若是第一頁(有列表的頁面則是第一頁),用Selenium操作瀏覽器拉到最下方
      9.把渲染好的頁面傳給pageProcessor
  pageProcessor
      1.判斷是列表頁面還是詳情頁面
      2.如果是列表頁面就獲取所有的地址傳給隊列
      3.傳給隊列一個下一頁地址:http://nextpage 並添加附件,內容為這一頁地址,方便downloader對象點擊下一頁按鈕,
          為了防止隊列把相同地址當作重複請求刪除,在地址后添加一個 ?url=當前頁地址
      4.獲取列表中每個商品的sku和spu封裝成實體類集合傳給pipeline(因為詳情頁面不好找;通過詳情頁面的sku即可找到對應的spu)
      5.如果是詳情頁面就封裝成實體類傳入pipeline
  pipeline
      1.如果獲取到列表對象,就保存每個實體類到數據庫
      2.如果獲取到實體類,就憑借實體類sku去數據庫取出對應的一條數據
      3.融合實體類,更新數據庫

  scheduler
      

源碼:

  導入依賴

<!-- Spring Boot parent POM: supplies the versions for the dependencies below that declare none. -->
<parent>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-parent</artifactId>
        <version>2.0.2.RELEASE</version>
    </parent>

    <dependencies>
        <!-- WebMagic core. The slf4j-log4j12 binding is excluded
             (presumably to avoid clashing with Spring Boot's own logging binding; confirm). -->
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-core</artifactId>
            <version>0.7.3</version>
            <exclusions>
                <exclusion>
                    <groupId>org.slf4j</groupId>
                    <artifactId>slf4j-log4j12</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
        <!-- WebMagic extension module -->
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-extension</artifactId>
            <version>0.7.3</version>
        </dependency>

        <!-- Utility library (Apache Commons Lang) -->
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-lang3</artifactId>
        </dependency>
        <!-- Spring MVC -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-web</artifactId>
        </dependency>

        <!-- Spring Data JPA -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-data-jpa</artifactId>
        </dependency>

        <!-- Unit testing -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-test</artifactId>
        </dependency>

        <!-- MySQL JDBC driver -->
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
        </dependency>

        <!-- Selenium, used by the downloader to drive Chrome -->
        <dependency>
            <groupId>org.seleniumhq.selenium</groupId>
            <artifactId>selenium-java</artifactId>
            <version>3.13.0</version>
        </dependency>

    </dependencies>

  PageProcessor組件

package com.myjava.crawler.component;

import com.myjava.crawler.entity.Item;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.Selectable;

import java.io.FileOutputStream;
import java.io.IOException;
import java.util.*;

/**
 * WebMagic {@link PageProcessor} that handles two kinds of rendered JD pages.
 *
 * <ul>
 *   <li><b>List page</b> (contains {@code ul.gl-warp > li.gl-item}): queues every product link,
 *       emits an {@code "itemList"} field holding one {@link Item} (sku + spu) per product, and
 *       queues a synthetic {@code http://nextpage?url=...} request whose {@code currentPageUrl}
 *       extra carries the URL of the current page.</li>
 *   <li><b>Detail page</b> (anything else): emits an {@code "item"} field with sku, title, price,
 *       locally saved picture path, URL and timestamps.</li>
 * </ul>
 */
@Component
public class JdPageProcessor implements PageProcessor {

    /** User-Agent header; without it the expected data is not returned. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:73.0) Gecko/20100101 Firefox/73.0";

    /** Directory that downloaded product pictures are written to. */
    private static final String IMAGE_DIR = "D:\\temp\\img\\";

    /** Site configuration, built once and returned from {@link #getSite()}. */
    private final Site site = Site.me().addHeader("User-Agent", USER_AGENT);

    /**
     * Dispatches to list-page or detail-page handling depending on whether the page
     * contains product list entries.
     *
     * @param page the downloaded (already rendered) page
     */
    @Override
    public void process(Page page) {
        Html html = page.getHtml();
        boolean isListPage = !html.css("ul.gl-warp > li.gl-item").nodes().isEmpty();
        if (isListPage) {
            processListPage(page, html);
        } else {
            processDetailPage(page, html);
        }
    }

    /** Queues the product links and the "next page" request, and emits the sku/spu list. */
    private void processListPage(Page page, Html html) {
        // Queue every product detail link found in the list.
        List<String> hrefs = html.css("div.gl-i-wrap div.p-img a", "href").all();
        page.addTargetRequests(hrefs);

        // Collect sku/spu here because the spu is read from the list entry's attributes;
        // the pipeline later matches detail data to these rows by sku.
        List<Item> itemList = new ArrayList<>();
        Elements entries = html.getDocument().select("ul.gl-warp.clearfix li.gl-item");
        for (Element entry : entries) {
            Long sku = parseLongOrNull(entry.attr("data-sku"));
            if (sku == null) {
                // An entry without a numeric sku cannot be matched later; skip it
                // instead of aborting the whole page with a NumberFormatException.
                continue;
            }
            Item item = new Item();
            item.setSku(sku);
            Long spu = parseLongOrNull(entry.attr("data-spu"));
            if (spu != null) {
                item.setSpu(spu);
            }
            itemList.add(item);
        }
        page.putField("itemList", itemList);

        // Queue a synthetic "next page" request. The current URL is appended as a query
        // parameter so each such request is unique for the scheduler's duplicate removal,
        // and is also passed as an extra so the downloader knows which page to open.
        String currentPageUrl = page.getUrl().get();
        Request nextPageRequest = new Request("http://nextpage?url=" + currentPageUrl);
        Map<String, Object> extras = new HashMap<>();
        extras.put("currentPageUrl", currentPageUrl);
        nextPageRequest.setExtras(extras);
        page.addTargetRequest(nextPageRequest);
    }

    /** Extracts the product data of a detail page and emits it as the {@code "item"} field. */
    private void processDetailPage(Page page, Html html) {
        // SKU (stock keeping unit) of the product.
        String rawSku = html.css("div.preview-info div.left-btns a.follow.J-follow", "data-id").get();
        Long sku = parseLongOrNull(rawSku);
        if (sku == null) {
            // Without a sku the pipeline cannot find the matching row, so emit nothing.
            System.err.println("No SKU found on page " + page.getUrl().get() + "; page ignored");
            return;
        }

        Date now = new Date();
        Item item = new Item();
        item.setSku(sku);
        item.setUpdated(now);
        item.setCreated(now);
        item.setUrl(page.getUrl().get());
        item.setTitle(html.css(".sku-name", "text").get());

        Double price = parseDoubleOrNull(html.css("div.dd span.p-price span.price", "text").get());
        if (price != null) {
            item.setPrice(price);
        }
        item.setPic(parsePicture(page));
        page.putField("item", item);
    }

    /**
     * Returns the site configuration carrying the required User-Agent header.
     *
     * @return the site configuration used for this crawl
     */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Downloads the main product picture of a detail page into {@code D:\temp\img\}
     * under a random UUID file name.
     *
     * @param page the detail page containing {@code img#spec-img}
     * @return the local path of the saved picture, or {@code null} if the page has no
     *         picture or the download failed
     */
    public String parsePicture(Page page) {
        String src = page.getHtml().css("div#spec-n1.jqzoom.main-img img#spec-img", "src").get();
        if (src == null || src.trim().isEmpty()) {
            return null;
        }
        src = src.trim();
        // The src attribute is normally protocol-relative ("//host/path"); add a scheme if missing.
        String imageUrl = src.startsWith("http") ? src : "http:" + src;

        HttpGet get = new HttpGet(imageUrl);
        get.setHeader("User-Agent", USER_AGENT);
        String finalPath = IMAGE_DIR + UUID.randomUUID() + extensionOf(imageUrl);

        // try-with-resources closes the response, then the client, even on failure.
        try (CloseableHttpClient client = HttpClients.createDefault();
             CloseableHttpResponse response = client.execute(get)) {
            HttpEntity entity = response.getEntity();
            if (response.getStatusLine().getStatusCode() != 200 || entity == null) {
                System.err.println("Could not download picture " + imageUrl
                        + ": " + response.getStatusLine());
                return null;
            }
            try (FileOutputStream out = new FileOutputStream(finalPath)) {
                entity.writeTo(out);
            }
            return finalPath;
        } catch (IOException e) {
            System.err.println("Could not save picture " + imageUrl + " to " + finalPath);
            e.printStackTrace();
            return null;
        }
    }

    /** Returns the file extension (including the dot) of the URL's last path segment, or "". */
    private static String extensionOf(String url) {
        int dot = url.lastIndexOf('.');
        return dot > url.lastIndexOf('/') ? url.substring(dot) : "";
    }

    /** Parses a long, returning {@code null} for null, blank or non-numeric text. */
    private static Long parseLongOrNull(String text) {
        if (text == null || text.trim().isEmpty()) {
            return null;
        }
        try {
            return Long.valueOf(text.trim());
        } catch (NumberFormatException e) {
            return null;
        }
    }

    /** Parses a double, returning {@code null} for null, blank or non-numeric text. */
    private static Double parseDoubleOrNull(String text) {
        if (text == null || text.trim().isEmpty()) {
            return null;
        }
        try {
            return Double.valueOf(text.trim());
        } catch (NumberFormatException e) {
            return null;
        }
    }
}

  Downloader組件

package com.myjava.crawler.component;

import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
import org.openqa.selenium.remote.RemoteWebDriver;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.downloader.Downloader;
import us.codecraft.webmagic.selector.PlainText;

import java.util.List;

/**
 * WebMagic {@link Downloader} that renders pages in a Selenium-driven Chrome browser.
 *
 * <p>Requests whose URL contains {@code nextpage} are synthetic: the downloader opens the page
 * given in the {@code currentPageUrl} extra, clicks the "next page" button and returns the
 * resulting page. All other requests are opened directly; list pages are additionally scrolled
 * to the bottom before their source is captured.
 *
 * <p>A single browser instance is shared by all calls. The class implements
 * {@link AutoCloseable} so the browser can be shut down through {@link #close()}
 * (Spring is expected to call it when the application context closes).
 */
@Component
public class JdDownloader implements Downloader, AutoCloseable {

    /** Marker contained in the synthetic "next page" request URL ({@code http://nextpage?url=}). */
    private static final String NEXT_PAGE_MARKER = "nextpage";

    /** Script that scrolls close to the bottom of the current page. */
    private static final String SCROLL_SCRIPT = "window.scrollTo(0, document.body.scrollHeight - 300)";

    private final RemoteWebDriver chromeDriver;

    /** Starts the Chrome browser used for all downloads. */
    public JdDownloader() {
        System.setProperty("webdriver.chrome.driver", "C:\\Program Files (x86)\\Google\\Chrome\\Application\\chromedriver.exe");
        ChromeOptions chromeOptions = new ChromeOptions();
        // Headless mode is intentionally left off here; to enable it add:
        //   chromeOptions.addArguments("--headless");
        // Browser window size (optional).
        chromeOptions.addArguments("--window-size=1024,768");
        chromeDriver = new ChromeDriver(chromeOptions);
    }

    /**
     * Loads the requested page in the browser and returns its rendered source.
     *
     * @param request the request to download; a "next page" request must carry the
     *                {@code currentPageUrl} extra
     * @param task    the running task (unused)
     * @return the rendered page, or a page flagged as not successfully downloaded when a
     *         "next page" request cannot be fulfilled
     */
    @Override
    public Page download(Request request, Task task) {
        if (request.getUrl().contains(NEXT_PAGE_MARKER)) {
            return downloadNextPage(request);
        }

        chromeDriver.get(request.getUrl());
        // A page containing list entries is a list page: scroll to the bottom and wait
        // before capturing it. Detail pages are captured as loaded.
        List<WebElement> listEntries = chromeDriver.findElementsByCssSelector(".gl-item");
        if (!listEntries.isEmpty()) {
            scrollToBottomAndWait();
        }
        return createPage(chromeDriver.getPageSource(), chromeDriver.getCurrentUrl());
    }

    /** Opens the page named in the request's extra, clicks "next page" and captures the result. */
    private Page downloadNextPage(Request request) {
        Object currentPageUrl = request.getExtra("currentPageUrl");
        if (!(currentPageUrl instanceof String)) {
            // Nothing to navigate from; report a failed download instead of calling get(null).
            return failedPage(request);
        }
        pause(3000);
        chromeDriver.get((String) currentPageUrl);

        // findElements returns an empty list (rather than throwing) when there is no button.
        List<WebElement> nextButtons =
                chromeDriver.findElementsByCssSelector("div#J_topPage.f-pager a.fp-next");
        if (nextButtons.isEmpty()) {
            return failedPage(request);
        }
        nextButtons.get(0).click();
        pause(2000);

        scrollToBottomAndWait();
        return createPage(chromeDriver.getPageSource(), chromeDriver.getCurrentUrl());
    }

    /** Scrolls near the bottom of the page and waits two seconds. */
    private void scrollToBottomAndWait() {
        chromeDriver.executeScript(SCROLL_SCRIPT);
        pause(2000);
    }

    /** Sleeps for the given time; restores the interrupt flag if interrupted. */
    private static void pause(long millis) {
        try {
            Thread.sleep(millis);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
        }
    }

    /** Builds a page flagged as not successfully downloaded for the given request. */
    private static Page failedPage(Request request) {
        Page page = new Page();
        page.setRequest(request);
        page.setDownloadSuccess(false);
        return page;
    }

    /**
     * Wraps rendered page source in a WebMagic {@link Page}.
     *
     * @param pageSource the rendered HTML
     * @param url        the URL the browser ended up on
     * @return a page marked as successfully downloaded
     */
    public Page createPage(String pageSource, String url) {
        Page page = new Page();
        page.setRawText(pageSource);
        page.setUrl(new PlainText(url));
        // The request object must be set (required).
        page.setRequest(new Request(url));
        // Mark the download as successful (required).
        page.setDownloadSuccess(true);
        return page;
    }

    /**
     * Ignored: every download goes through the one shared browser instance.
     *
     * @param i the requested thread count
     */
    @Override
    public void setThread(int i) {
        // Intentionally empty.
    }

    /** Quits the browser and ends the WebDriver session. */
    @Override
    public void close() {
        chromeDriver.quit();
    }
}

  Pipeline組件

package com.myjava.crawler.component;

import com.myjava.crawler.dao.ItemDao;
import com.myjava.crawler.entity.Item;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;

import java.util.List;

/**
 * WebMagic {@link Pipeline} that persists crawled {@link Item}s through {@link ItemDao}.
 *
 * <p>List pages deliver an {@code "itemList"} field whose entries are stored as new rows.
 * Detail pages deliver an {@code "item"} field that is merged into the row with the same sku.
 */
@Component
public class JdPipeline implements Pipeline {

    @Autowired
    private ItemDao itemDao;

    /**
     * Stores the results of one processed page.
     *
     * @param resultItems fields emitted by the page processor ({@code "itemList"} and/or {@code "item"})
     * @param task        the running task (unused)
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        List<Item> itemList = resultItems.get("itemList");
        if (itemList != null) {
            for (Item listItem : itemList) {
                // Store each sku only once: a second row for the same sku would make the
                // single-result findBySku lookup below ambiguous.
                if (itemDao.findBySku(listItem.getSku()) == null) {
                    itemDao.save(listItem);
                }
            }
        }

        Item item = resultItems.get("item");
        if (item == null) {
            return;
        }

        Item stored = itemDao.findBySku(item.getSku());
        if (stored == null) {
            // No list row for this sku yet: store the detail data as a new row.
            itemDao.save(item);
            return;
        }

        // Merge the detail data into the existing row.
        stored.setPic(item.getPic());
        stored.setPrice(item.getPrice());
        stored.setTitle(item.getTitle());
        stored.setUrl(item.getUrl());
        if (stored.getCreated() == null) {
            // Keep the original creation time on later updates.
            stored.setCreated(item.getCreated());
        }
        stored.setUpdated(item.getUpdated());
        itemDao.save(stored);
    }
}

  Spider開啟爬蟲

package com.myjava.crawler.component;

import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Spider;

/**
 * Wires the JD page processor, downloader and pipeline into a WebMagic {@link Spider}
 * and launches the crawl from the seed search URL.
 */
@Component
public class JdSpider {

    /** Seed URL: the JD search-result page the crawl starts from. */
    private static final String SEED_URL =
            "https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&pvid=f50e5a02031849e4a9f8adbb9928b7ac";

    @Autowired
    private JdPageProcessor pageProcessor;

    @Autowired
    private JdDownloader downloader;

    @Autowired
    private JdPipeline pipeline;

    /** Builds the spider from the injected components and starts it. */
    public void start() {
        Spider spider = Spider.create(pageProcessor);
        spider.setDownloader(downloader);
        spider.addPipeline(pipeline);
        spider.addUrl(SEED_URL);
        spider.start();
    }
}

 

 

 

爬取數據過程中的問題及解決方法:

  問題:

    在爬取過程中,明明css選擇器沒寫錯卻取不到數據

  解決方法:

    在請求頭中加入  User-Agent   :    Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:73.0) Gecko/20100101 Firefox/73.0  即可解決

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM