java 使用webmagic 爬蟲框架爬取博客園數據


                           java 使用webmagic 爬蟲框架爬取博客園數據存入數據庫

學習記錄  

 

WebMagic 簡介:

    WebMagic是一個簡單靈活的Java爬蟲框架。你可以快速開發出一個高效、易維護的爬蟲。

    http://webmagic.io/

 

準備工作:

  

  Maven 依賴(我這裡用 Maven 創建的 web 項目做測試):

<dependencies>
    <!-- JUnit (tests only) -->
    <dependency>
        <groupId>junit</groupId>
        <artifactId>junit</artifactId>
        <version>4.11</version>
        <scope>test</scope>
    </dependency>

    <!-- Logging: SLF4J API -->
    <dependency>
        <groupId>org.slf4j</groupId>
        <artifactId>slf4j-api</artifactId>
        <version>1.7.12</version>
    </dependency>

    <dependency>
        <groupId>ch.qos.logback</groupId>
        <artifactId>logback-core</artifactId>
        <version>1.2.3</version>
    </dependency>
    <!-- Logback implementation of the SLF4J API -->
    <dependency>
        <groupId>ch.qos.logback</groupId>
        <artifactId>logback-classic</artifactId>
        <version>1.2.3</version>
    </dependency>

    <!-- Database -->
    <!-- Default (compile) scope on purpose: the Java classes in this article import
         com.mysql.jdbc.Connection directly, so the driver must be on the compile
         classpath; with runtime scope the sources do not compile. -->
    <dependency>
        <groupId>mysql</groupId>
        <artifactId>mysql-connector-java</artifactId>
        <version>5.1.34</version>
    </dependency>
    <!-- c3p0 connection pool -->
    <dependency>
        <groupId>c3p0</groupId>
        <artifactId>c3p0</artifactId>
        <version>0.9.1.2</version>
    </dependency>

    <!-- DAO framework: MyBatis -->
    <dependency>
        <groupId>org.mybatis</groupId>
        <artifactId>mybatis</artifactId>
        <version>3.4.0</version>
    </dependency>
    <!-- MyBatis / Spring integration -->
    <dependency>
        <groupId>org.mybatis</groupId>
        <artifactId>mybatis-spring</artifactId>
        <version>1.3.0</version>
    </dependency>

    <!-- Servlet / web dependencies -->
    <dependency>
        <groupId>taglibs</groupId>
        <artifactId>standard</artifactId>
        <version>1.1.2</version>
    </dependency>
    <dependency>
        <groupId>jstl</groupId>
        <artifactId>jstl</artifactId>
        <version>1.2</version>
    </dependency>
    <dependency>
        <groupId>com.fasterxml.jackson.core</groupId>
        <artifactId>jackson-databind</artifactId>
        <version>2.5.1</version>
    </dependency>
    <dependency>
        <groupId>javax.servlet</groupId>
        <artifactId>javax.servlet-api</artifactId>
        <version>3.1.0</version>
    </dependency>

    <!-- Spring dependencies -->
    <!-- 1. Spring core -->
    <dependency>
        <groupId>org.springframework</groupId>
        <artifactId>spring-core</artifactId>
        <version>4.2.5.RELEASE</version>
    </dependency>
    <dependency>
        <groupId>org.springframework</groupId>
        <artifactId>spring-beans</artifactId>
        <version>4.2.5.RELEASE</version>
    </dependency>
    <dependency>
        <groupId>org.springframework</groupId>
        <artifactId>spring-context</artifactId>
        <version>4.2.5.RELEASE</version>
    </dependency>
    <!-- 2. Spring DAO layer -->
    <dependency>
        <groupId>org.springframework</groupId>
        <artifactId>spring-jdbc</artifactId>
        <version>4.2.5.RELEASE</version>
    </dependency>
    <dependency>
        <groupId>org.springframework</groupId>
        <artifactId>spring-tx</artifactId>
        <version>4.2.5.RELEASE</version>
    </dependency>

    <!-- Spring web -->
    <dependency>
        <groupId>org.springframework</groupId>
        <artifactId>spring-web</artifactId>
        <version>4.2.5.RELEASE</version>
    </dependency>
    <dependency>
        <groupId>org.springframework</groupId>
        <artifactId>spring-webmvc</artifactId>
        <version>4.2.5.RELEASE</version>
    </dependency>

    <!-- Spring test support; kept at the same version as the other Spring modules -->
    <dependency>
        <groupId>org.springframework</groupId>
        <artifactId>spring-test</artifactId>
        <version>4.2.5.RELEASE</version>
    </dependency>
    <!-- WebMagic web crawler -->
    <dependency>
        <groupId>us.codecraft</groupId>
        <artifactId>webmagic-extension</artifactId>
        <version>0.7.3</version>
    </dependency>
</dependencies>
View Code

 

   數據庫表SQL:

-- Table holding one row per crawled blog post entry.
-- The crawler's INSERT statement writes the columns title, linke, author, authorUrl and
-- summary by name, so the `linke` spelling has to stay in sync with the Java code.
-- NOTE(review): AUTO_INCREMENT=4890 looks like a leftover from exporting an already
-- populated table; confirm whether a freshly created table should really start there.
CREATE TABLE `Boke` (
  `id` int(11) NOT NULL AUTO_INCREMENT COMMENT 'id',
  `title` varchar(255) DEFAULT NULL COMMENT '標題',
  `linke` varchar(255) DEFAULT NULL COMMENT '正文地址',
  `author` varchar(255) DEFAULT NULL COMMENT '作者',
  `authorUrl` varchar(255) DEFAULT NULL COMMENT '作者主頁',
  `summary` varchar(1000) DEFAULT NULL COMMENT '簡介',
  PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=4890 DEFAULT CHARSET=utf8;

 

 數據庫連接工具類:

import java.sql.DriverManager;
import java.sql.SQLException;

import com.mysql.jdbc.Connection;

/**
 * Small JDBC helper that opens connections to the MySQL database used by the crawler.
 * Driver class, URL and credentials are fixed in this class.
 */
public class MySqlJdbcUtils {

    private static final String JDBC_DRIVER = "com.mysql.jdbc.Driver";
    private static final String JDBC_URL = "jdbc:mysql://192.168.0.132:3306/xbDB?useUnicode=true&characterEncoding=utf-8";
    private static final String DB_USER = "tradingbp";
    private static final String DB_PASSWORD = "123456";

    /**
     * Loads the MySQL driver and opens a new connection.
     *
     * @return the opened connection, or {@code null} when the driver class cannot be
     *         found or the connection attempt fails (the stack trace is printed)
     */
    public static Connection getOpenConnection() {
        // Load the driver class first; without it there is nothing to connect with.
        try {
            Class.forName(JDBC_DRIVER);
        } catch (ClassNotFoundException driverMissing) {
            driverMissing.printStackTrace();
            return null;
        }

        try {
            Connection opened = (Connection) DriverManager.getConnection(JDBC_URL, DB_USER, DB_PASSWORD);
            System.out.println("獲得數據庫鏈接");
            return opened;
        } catch (SQLException connectFailed) {
            connectFailed.printStackTrace();
            return null;
        }
    }

    /** Manual smoke test: tries to open one connection. */
    public static void main(String[] args) {
        getOpenConnection();
    }

}
View Code

 

 實體類:

/**
 * Plain data holder for one blog post entry: title, post link, author, author home page
 * and summary. All properties are mutable and start out as {@code null}.
 *
 * @date   2017-08-24
 */
public class JavaBokeModel {

    /** Post title. */
    private String title;

    /** URL of the post. */
    private String linke;

    /** Author name. */
    private String author;

    /** URL of the author's home page. */
    private String authorUrl;

    /** Short summary of the post. */
    private String summary;

    /** @return the post title */
    public String getTitle() {
        return this.title;
    }

    /** @param title the post title */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the URL of the post */
    public String getLinke() {
        return this.linke;
    }

    /** @param linke the URL of the post */
    public void setLinke(String linke) {
        this.linke = linke;
    }

    /** @return the author name */
    public String getAuthor() {
        return this.author;
    }

    /** @param author the author name */
    public void setAuthor(String author) {
        this.author = author;
    }

    /** @return the URL of the author's home page */
    public String getAuthorUrl() {
        return this.authorUrl;
    }

    /** @param authorUrl the URL of the author's home page */
    public void setAuthorUrl(String authorUrl) {
        this.authorUrl = authorUrl;
    }

    /** @return the post summary */
    public String getSummary() {
        return this.summary;
    }

    /** @param summary the post summary */
    public void setSummary(String summary) {
        this.summary = summary;
    }

}
View Code

 

webmagic 框架爬取數據並保存

   

import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;

import com.mysql.jdbc.Connection;
import com.nio.webmagic.jdbc.MySqlJdbcUtils;
import com.nio.webmagic.model.JavaBokeModel;
/**
 * WebMagic page processor for the cnblogs Java category listing.
 *
 * <p>For every listing page it extracts title, link, author, author home page and summary
 * of each post item and inserts them into the {@code Boke} table over a single shared
 * JDBC connection.
 *
 * @version  [VCES V201R001, 2017-10-12]
 * @see PageProcessor
 */
public class JavaBoKePageProcessor implements PageProcessor {
    /** Base address of the listing; page numbers are appended to it. */
    private static final String LIST_URL = "http://www.cnblogs.com/cate/java/";
    /** First and last numbered listing page queued for crawling. */
    private static final int FIRST_PAGE = 2;
    private static final int LAST_PAGE = 200;

    /** CSS query for the title anchor (its text is the title, its href the post link). */
    private static final String TITLE_QUERY = "div.post_item_body h3 a.titlelnk";
    /** CSS query for the author anchor (its text is the name, its href the home page). */
    private static final String AUTHOR_QUERY = "div.post_item_foot a.lightblue ";
    /** CSS query for the summary paragraph. */
    private static final String SUMMARY_QUERY = "div.post_item_body p.post_item_summary";
    /** Parameterized insert statement for one post. */
    private static final String INSERT_SQL = "INSERT INTO Boke (title,linke,author,authorUrl,summary)VALUES(?,?,?,?,?)";

    /** Shared connection, opened lazily by {@link #getConnection()}. */
    private static Connection conn = null;

    /**
     * Returns the shared connection, opening it on first use.
     *
     * @return the connection, or {@code null} if it could not be opened
     */
    private static Connection getConnection() {
        if (conn == null) {
            conn = MySqlJdbcUtils.getOpenConnection();
        }
        return conn;
    }

    /** Closes the shared connection, if any, and reports a failure instead of hiding it. */
    private static void closeConnection() {
        if (conn == null) {
            return;
        }
        try {
            conn.close();
        } catch (SQLException e) {
            e.printStackTrace();
        } finally {
            conn = null;
        }
    }

    /**
     * Inserts the given posts, one row each.
     *
     * <p>Synchronized because the spider is started with several threads while all of them
     * write through the same connection. The statement is created per call and always
     * closed again, so no statement is left open between pages.
     *
     * @param javaBokes posts extracted from one page; an empty list is a no-op
     */
    private synchronized void insertDb(List<JavaBokeModel> javaBokes) {
        if (javaBokes.isEmpty()) {
            return;
        }
        Connection connection = getConnection();
        if (connection == null) {
            System.err.println("No database connection available; skipped " + javaBokes.size() + " record(s).");
            return;
        }

        PreparedStatement statement = null;
        try {
            statement = connection.prepareStatement(INSERT_SQL);
            for (JavaBokeModel javaBoke : javaBokes) {
                statement.setString(1, javaBoke.getTitle());
                statement.setString(2, javaBoke.getLinke());
                statement.setString(3, javaBoke.getAuthor());
                statement.setString(4, javaBoke.getAuthorUrl());
                statement.setString(5, javaBoke.getSummary());
                statement.executeUpdate();
            }
        } catch (SQLException e) {
            e.printStackTrace();
        } finally {
            if (statement != null) {
                try {
                    statement.close();
                } catch (SQLException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Builds the addresses of the numbered listing pages to crawl.
     *
     * @return the URLs of pages {@code FIRST_PAGE} to {@code LAST_PAGE}, inclusive
     */
    private static List<String> urls() {
        List<String> listUrl = new ArrayList<String>(LAST_PAGE - FIRST_PAGE + 1);
        for (int i = FIRST_PAGE; i <= LAST_PAGE; i++) {
            listUrl.add(LIST_URL + i);
        }
        return listUrl;
    }

    /**
     * Returns the combined text of all elements matching a CSS query.
     *
     * @param doc   parsed HTML fragment
     * @param query CSS query
     * @return the matched text, empty if nothing matches
     */
    private static String selectText(Document doc, String query) {
        return doc.select(query).text();
    }

    /**
     * Returns the {@code href} attribute of the first matching element that has one.
     *
     * @param doc   parsed HTML fragment
     * @param query CSS query
     * @return the link address, empty if nothing matches
     */
    private static String selectHref(Document doc, String query) {
        return doc.select(query).attr("href");
    }

    /**
     * Extraction logic for one downloaded page: queues the numbered listing pages, turns
     * every {@code div.post_item} into a {@link JavaBokeModel} and stores the result.
     *
     * @see us.codecraft.webmagic.processor.PageProcessor#process(us.codecraft.webmagic.Page)
     */
    @Override
    public void process(Page page) {
        page.addTargetRequests(urls());

        List<String> htmls = page.getHtml().xpath("//div[@class='post_item']/html()").all();
        List<JavaBokeModel> javaBokes = new ArrayList<JavaBokeModel>(htmls.size());
        for (String html : htmls) {
            // Parse each post fragment once and run all queries against the same document.
            Document item = Jsoup.parse(html);

            JavaBokeModel javaBoke = new JavaBokeModel();
            javaBoke.setTitle(selectText(item, TITLE_QUERY));
            javaBoke.setLinke(selectHref(item, TITLE_QUERY));
            javaBoke.setAuthor(selectText(item, AUTHOR_QUERY));
            javaBoke.setAuthorUrl(selectHref(item, AUTHOR_QUERY));
            javaBoke.setSummary(selectText(item, SUMMARY_QUERY));
            javaBokes.add(javaBoke);
        }
        insertDb(javaBokes);
    }

    /**
     * Crawl settings for the site: 1000 ms sleep time and up to 10 retries.
     */
    @Override
    public Site getSite() {
        return Site.me().setSleepTime(1000).setRetryTimes(10);
    }

    /**
     * Opens the database connection, runs the spider with 5 threads from the category
     * entry page, closes the connection and prints the elapsed time in seconds.
     */
    public static void main(String[] args) {
        System.out.println("========小爬蟲【啟動】嘍!=========");
        if (getConnection() == null) {
            // Nothing could be stored, so crawling would be pointless.
            System.err.println("No database connection available; crawler not started.");
            return;
        }
        long startTime = System.currentTimeMillis();
        try {
            Spider.create(new JavaBoKePageProcessor())
                    .addUrl(LIST_URL)
                    .thread(5)
                    .run();
        } finally {
            // Release the connection even when the spider run fails.
            closeConnection();
        }
        long endTime = System.currentTimeMillis();
        System.out.println("========小爬蟲【結束】嘍!=========");
        System.out.println("用時為:" + (endTime - startTime) / 1000 + "s");
    }

}

 

數據:

 

 

 

 

 

 

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM