java 使用webmagic 爬蟲框架爬取博客園數據存入數據庫
學習記錄
webmagic簡介:
WebMagic是一個簡單靈活的Java爬蟲框架。基於WebMagic,你可以快速開發出一個高效、易維護的爬蟲。
準備工作:
Maven依賴(我這裡用Maven創建的web項目做測試):
<dependencies>
    <!-- junit -->
    <dependency>
        <groupId>junit</groupId>
        <artifactId>junit</artifactId>
        <version>4.11</version>
        <scope>test</scope>
    </dependency>

    <!-- Logging -->
    <dependency>
        <groupId>org.slf4j</groupId>
        <artifactId>slf4j-api</artifactId>
        <version>1.7.12</version>
    </dependency>
    <dependency>
        <groupId>ch.qos.logback</groupId>
        <artifactId>logback-core</artifactId>
        <version>1.2.3</version>
    </dependency>
    <!-- slf4j implementation -->
    <dependency>
        <groupId>ch.qos.logback</groupId>
        <artifactId>logback-classic</artifactId>
        <version>1.2.3</version>
    </dependency>

    <!-- Database -->
    <!--
        Default (compile) scope on purpose: the Java classes in this article
        import com.mysql.jdbc.Connection directly, so the driver has to be on
        the compile classpath, not only at runtime.
    -->
    <dependency>
        <groupId>mysql</groupId>
        <artifactId>mysql-connector-java</artifactId>
        <version>5.1.34</version>
    </dependency>
    <!-- c3p0 connection pool -->
    <dependency>
        <groupId>c3p0</groupId>
        <artifactId>c3p0</artifactId>
        <version>0.9.1.2</version>
    </dependency>

    <!-- DAO framework: mybatis -->
    <dependency>
        <groupId>org.mybatis</groupId>
        <artifactId>mybatis</artifactId>
        <version>3.4.0</version>
    </dependency>
    <!-- mybatis / spring integration -->
    <dependency>
        <groupId>org.mybatis</groupId>
        <artifactId>mybatis-spring</artifactId>
        <version>1.3.0</version>
    </dependency>

    <!-- Servlet / web dependencies -->
    <dependency>
        <groupId>taglibs</groupId>
        <artifactId>standard</artifactId>
        <version>1.1.2</version>
    </dependency>
    <dependency>
        <groupId>jstl</groupId>
        <artifactId>jstl</artifactId>
        <version>1.2</version>
    </dependency>
    <dependency>
        <groupId>com.fasterxml.jackson.core</groupId>
        <artifactId>jackson-databind</artifactId>
        <version>2.5.1</version>
    </dependency>
    <dependency>
        <groupId>javax.servlet</groupId>
        <artifactId>javax.servlet-api</artifactId>
        <version>3.1.0</version>
    </dependency>

    <!-- Spring -->
    <!-- 1. Spring core -->
    <dependency>
        <groupId>org.springframework</groupId>
        <artifactId>spring-core</artifactId>
        <version>4.2.5.RELEASE</version>
    </dependency>
    <dependency>
        <groupId>org.springframework</groupId>
        <artifactId>spring-beans</artifactId>
        <version>4.2.5.RELEASE</version>
    </dependency>
    <dependency>
        <groupId>org.springframework</groupId>
        <artifactId>spring-context</artifactId>
        <version>4.2.5.RELEASE</version>
    </dependency>
    <!-- 2. Spring DAO layer -->
    <dependency>
        <groupId>org.springframework</groupId>
        <artifactId>spring-jdbc</artifactId>
        <version>4.2.5.RELEASE</version>
    </dependency>
    <dependency>
        <groupId>org.springframework</groupId>
        <artifactId>spring-tx</artifactId>
        <version>4.2.5.RELEASE</version>
    </dependency>
    <!-- Spring web -->
    <dependency>
        <groupId>org.springframework</groupId>
        <artifactId>spring-web</artifactId>
        <version>4.2.5.RELEASE</version>
    </dependency>
    <dependency>
        <groupId>org.springframework</groupId>
        <artifactId>spring-webmvc</artifactId>
        <version>4.2.5.RELEASE</version>
    </dependency>
    <!-- Spring test; kept on the same version as the other Spring modules -->
    <dependency>
        <groupId>org.springframework</groupId>
        <artifactId>spring-test</artifactId>
        <version>4.2.5.RELEASE</version>
    </dependency>

    <!-- webmagic web crawler -->
    <dependency>
        <groupId>us.codecraft</groupId>
        <artifactId>webmagic-extension</artifactId>
        <version>0.7.3</version>
    </dependency>
</dependencies>
數據庫表SQL:
-- Table that stores one row per crawled blog post.
-- Column names (including `linke`) are referenced by the crawler's INSERT statement.
CREATE TABLE `Boke` (
  `id`        INT(11)       NOT NULL AUTO_INCREMENT COMMENT 'id',
  `title`     VARCHAR(255)  DEFAULT NULL COMMENT '標題',
  `linke`     VARCHAR(255)  DEFAULT NULL COMMENT '正文地址',
  `author`    VARCHAR(255)  DEFAULT NULL COMMENT '作者',
  `authorUrl` VARCHAR(255)  DEFAULT NULL COMMENT '作者主頁',
  `summary`   VARCHAR(1000) DEFAULT NULL COMMENT '簡介',
  PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=4890 DEFAULT CHARSET=utf8;
數據庫鏈接工具類:
import java.sql.DriverManager; import java.sql.SQLException; import com.mysql.jdbc.Connection; public class MySqlJdbcUtils { private static String driver = "com.mysql.jdbc.Driver"; private static String url = "jdbc:mysql://192.168.0.132:3306/xbDB?useUnicode=true&characterEncoding=utf-8"; private static String name="tradingbp"; private static String pwd="123456"; /** * * 獲取鏈接 * * @date 2017年8月31日 * @return */ public static Connection getOpenConnection(){ Connection conn= null; try { //加載驅動 Class.forName(driver); conn=(Connection) DriverManager.getConnection(url, name, pwd); System.out.println("獲得數據庫鏈接"); } catch (ClassNotFoundException e) { e.printStackTrace(); }catch (SQLException e) { e.printStackTrace(); } return conn; } public static void main(String[] args) { getOpenConnection(); } }
實體類:
/**
 * Plain data holder for one blog post scraped from the listing page.
 *
 * <p>All properties are mutable and are {@code null} until set.
 */
public class JavaBokeModel {

    /** Post title. */
    private String title;

    /** URL of the post body. */
    private String linke;

    /** Author display name. */
    private String author;

    /** URL of the author's home page. */
    private String authorUrl;

    /** Short summary of the post. */
    private String summary;

    /** @return the post summary, or {@code null} if not set */
    public String getSummary() {
        return summary;
    }

    /** @param summary the post summary */
    public void setSummary(String summary) {
        this.summary = summary;
    }

    /** @return the post title, or {@code null} if not set */
    public String getTitle() {
        return title;
    }

    /** @param title the post title */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the URL of the post body, or {@code null} if not set */
    public String getLinke() {
        return linke;
    }

    /** @param linke the URL of the post body */
    public void setLinke(String linke) {
        this.linke = linke;
    }

    /** @return the author name, or {@code null} if not set */
    public String getAuthor() {
        return author;
    }

    /** @param author the author name */
    public void setAuthor(String author) {
        this.author = author;
    }

    /** @return the URL of the author's home page, or {@code null} if not set */
    public String getAuthorUrl() {
        return authorUrl;
    }

    /** @param authorUrl the URL of the author's home page */
    public void setAuthorUrl(String authorUrl) {
        this.authorUrl = authorUrl;
    }
}
webmagic 框架爬取數據並保存:
import java.sql.PreparedStatement; import java.sql.SQLException; import java.util.ArrayList; import java.util.Date; import java.util.List; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.processor.PageProcessor; import com.mysql.jdbc.Connection; import com.nio.webmagic.jdbc.MySqlJdbcUtils; import com.nio.webmagic.model.JavaBokeModel; /** * * 爬蟲 * * @version [VCES V201R001, 2017年10月12日] * * @see 方法實現 PageProcessor * @since [產品/模塊版本] */ public class JavaBoKePageProcessor implements PageProcessor { private static Connection conn=null; private static PreparedStatement ps =null; //標題和鏈接獲取 private static String TITLEQUERY="div.post_item_body h3 a.titlelnk"; //作者 private static String AUTHORQUERY="div.post_item_foot a.lightblue "; //簡介 private static String SUMMARYQUERY="div.post_item_body p.post_item_summary"; //插入sql語句 private static String insertSql ="INSERT INTO Boke (title,linke,author,authorUrl,summary)VALUES(?,?,?,?,?)"; //初始鏈接 private static Connection getConnection(){ if (conn==null) { conn = MySqlJdbcUtils.getOpenConnection(); } return conn; } /** * * insert操作 * * @date 2017年8月31日 * @return */ private synchronized void insertDb(List<JavaBokeModel> javaBokes){ try { ps = conn.prepareStatement(insertSql); for (JavaBokeModel javaBoke:javaBokes) { ps.setString(1, javaBoke.getTitle().toString()); ps.setString(2, javaBoke.getLinke().toString()); ps.setString(3, javaBoke.getAuthor().toString()); ps.setString(4, javaBoke.getAuthorUrl().toString()); ps.setString(5, javaBoke.getSummary().toString()); ps.executeUpdate(); } } catch (SQLException e) { // TODO Auto-generated catch block e.printStackTrace(); } } //初始化帶爬取網頁地址 private static List<String> urls(){ List<String> listUrl =new ArrayList<String>(); for (int i = 2; i <=200; i++) { //listUrl.add("http://www.cnblogs.com/cate/java/"+i); 
listUrl.add("http://www.cnblogs.com/cate/java/"+i); } listUrl.toArray(new String[listUrl.size()]); return listUrl; } /** * * jsoup根據 html 字符串和語法獲取內容; * @date 2017年8月31日 * @param htmlText * @return */ private static String seletDocumentText(String htmlText,String Query){ Document doc = Jsoup.parse(htmlText); String select = doc.select(Query).text(); return select; } /** * * jsoup根據 html 字符串和語法獲取鏈接地址; * @date 2017年8月31日 * @param htmlText * @return */ private static String seletDocumentLink(String htmlText,String Query){ Document doc = Jsoup.parse(htmlText); String select = doc.select(Query).attr("href"); return select; } /** * process是定制爬蟲邏輯的核心接口,在這里編寫抽取邏輯 * @see us.codecraft.webmagic.processor.PageProcessor#process(us.codecraft.webmagic.Page) */ @Override public void process(Page page) { // page.addTargetRequests(urls()); //div[@class='post_item']//div[@class='post_item_body']//h3//a[@class='titlelnk']/text()' // 定義如何抽取頁面信息,並保存下來 List<String> htmls =page.getHtml().xpath("//div[@class='post_item']/html()").all(); List<JavaBokeModel> javaBokes=new ArrayList<JavaBokeModel>(); for (String html:htmls) { JavaBokeModel javaBoke =new JavaBokeModel(); //標題和鏈接 String title =seletDocumentText(html,TITLEQUERY); String linke =seletDocumentLink(html,TITLEQUERY); //作者和作者主頁 String author=seletDocumentText(html, AUTHORQUERY); String authorUrl=seletDocumentLink(html, AUTHORQUERY); //簡介 String summary=seletDocumentText(html, SUMMARYQUERY); javaBoke.setTitle(title); javaBoke.setAuthor(author); javaBoke.setAuthorUrl(authorUrl); javaBoke.setLinke(linke); javaBoke.setSummary(summary); javaBokes.add(javaBoke); } insertDb(javaBokes); } @Override public Site getSite() { //抓去網站的相關配置包括:編碼、重試次數、抓取間隔 return Site.me().setSleepTime(1000).setRetryTimes(10); } public static void main(String[] args) { long startTime ,endTime; System.out.println("========小爬蟲【啟動】嘍!========="); getConnection(); startTime = new Date().getTime(); //入口 Spider create = Spider.create(new JavaBoKePageProcessor()); //定義入口地址 
create.addUrl("http://www.cnblogs.com/cate/java/").thread(5).run(); try { ps.close(); conn.close(); } catch (Exception e) { // TODO: handle exception } endTime = new Date().getTime(); System.out.println("========小爬蟲【結束】嘍!========="); System.out.println("用時為:"+(endTime-startTime)/1000+"s"); } }
數據:
