Java 使用 WebMagic 爬虫框架爬取博客园数据并存入数据库
学习记录
webmagic简介:
WebMagic是一个简单灵活的Java爬虫框架。你可以快速开发出一个高效、易维护的爬虫。
准备工作:
Maven 依赖(这里使用 Maven 创建的 web 项目进行测试):

<dependencies>
    <!-- Unit testing -->
    <dependency>
        <groupId>junit</groupId>
        <artifactId>junit</artifactId>
        <version>4.11</version>
        <scope>test</scope>
    </dependency>

    <!-- Logging API -->
    <dependency>
        <groupId>org.slf4j</groupId>
        <artifactId>slf4j-api</artifactId>
        <version>1.7.12</version>
    </dependency>
    <dependency>
        <groupId>ch.qos.logback</groupId>
        <artifactId>logback-core</artifactId>
        <version>1.2.3</version>
    </dependency>
    <!-- Logback implementation of the slf4j API -->
    <dependency>
        <groupId>ch.qos.logback</groupId>
        <artifactId>logback-classic</artifactId>
        <version>1.2.3</version>
    </dependency>

    <!-- Database -->
    <!--
      Default (compile) scope on purpose: the sample classes import
      com.mysql.jdbc.Connection directly, so the driver has to be on the
      compile classpath. With runtime scope a Maven build cannot compile them.
    -->
    <dependency>
        <groupId>mysql</groupId>
        <artifactId>mysql-connector-java</artifactId>
        <version>5.1.34</version>
    </dependency>
    <!-- c3p0 connection pool -->
    <dependency>
        <groupId>c3p0</groupId>
        <artifactId>c3p0</artifactId>
        <version>0.9.1.2</version>
    </dependency>

    <!-- DAO framework: MyBatis -->
    <dependency>
        <groupId>org.mybatis</groupId>
        <artifactId>mybatis</artifactId>
        <version>3.4.0</version>
    </dependency>
    <!-- MyBatis / Spring integration -->
    <dependency>
        <groupId>org.mybatis</groupId>
        <artifactId>mybatis-spring</artifactId>
        <version>1.3.0</version>
    </dependency>

    <!-- Servlet / web dependencies -->
    <dependency>
        <groupId>taglibs</groupId>
        <artifactId>standard</artifactId>
        <version>1.1.2</version>
    </dependency>
    <dependency>
        <groupId>jstl</groupId>
        <artifactId>jstl</artifactId>
        <version>1.2</version>
    </dependency>
    <dependency>
        <groupId>com.fasterxml.jackson.core</groupId>
        <artifactId>jackson-databind</artifactId>
        <version>2.5.1</version>
    </dependency>
    <dependency>
        <groupId>javax.servlet</groupId>
        <artifactId>javax.servlet-api</artifactId>
        <version>3.1.0</version>
    </dependency>

    <!-- Spring -->
    <!-- 1. Spring core -->
    <dependency>
        <groupId>org.springframework</groupId>
        <artifactId>spring-core</artifactId>
        <version>4.2.5.RELEASE</version>
    </dependency>
    <dependency>
        <groupId>org.springframework</groupId>
        <artifactId>spring-beans</artifactId>
        <version>4.2.5.RELEASE</version>
    </dependency>
    <dependency>
        <groupId>org.springframework</groupId>
        <artifactId>spring-context</artifactId>
        <version>4.2.5.RELEASE</version>
    </dependency>
    <!-- 2. Spring DAO layer -->
    <dependency>
        <groupId>org.springframework</groupId>
        <artifactId>spring-jdbc</artifactId>
        <version>4.2.5.RELEASE</version>
    </dependency>
    <dependency>
        <groupId>org.springframework</groupId>
        <artifactId>spring-tx</artifactId>
        <version>4.2.5.RELEASE</version>
    </dependency>
    <!-- Spring web -->
    <dependency>
        <groupId>org.springframework</groupId>
        <artifactId>spring-web</artifactId>
        <version>4.2.5.RELEASE</version>
    </dependency>
    <dependency>
        <groupId>org.springframework</groupId>
        <artifactId>spring-webmvc</artifactId>
        <version>4.2.5.RELEASE</version>
    </dependency>
    <!-- Spring test support; version aligned with the other Spring modules -->
    <dependency>
        <groupId>org.springframework</groupId>
        <artifactId>spring-test</artifactId>
        <version>4.2.5.RELEASE</version>
    </dependency>

    <!-- WebMagic crawler framework -->
    <dependency>
        <groupId>us.codecraft</groupId>
        <artifactId>webmagic-extension</artifactId>
        <version>0.7.3</version>
        <!--
          WebMagic ships slf4j-log4j12 as its slf4j binding. This project
          already binds slf4j to logback-classic, so the log4j binding is
          excluded to avoid two bindings on the classpath.
        -->
        <exclusions>
            <exclusion>
                <groupId>org.slf4j</groupId>
                <artifactId>slf4j-log4j12</artifactId>
            </exclusion>
        </exclusions>
    </dependency>
</dependencies>
数据库表SQL:
-- Table that stores the blog posts written by the crawler
-- (one row per post: title, link, author, author home page, summary).
CREATE TABLE `Boke` (
  `id`        int(11)       NOT NULL AUTO_INCREMENT COMMENT 'id',
  `title`     varchar(255)  DEFAULT NULL COMMENT '标题',
  `linke`     varchar(255)  DEFAULT NULL COMMENT '正文地址',
  `author`    varchar(255)  DEFAULT NULL COMMENT '作者',
  `authorUrl` varchar(255)  DEFAULT NULL COMMENT '作者主页',
  `summary`   varchar(1000) DEFAULT NULL COMMENT '简介',
  PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=4890 DEFAULT CHARSET=utf8;
数据库连接工具类:

import java.sql.DriverManager; import java.sql.SQLException; import com.mysql.jdbc.Connection; public class MySqlJdbcUtils { private static String driver = "com.mysql.jdbc.Driver"; private static String url = "jdbc:mysql://192.168.0.132:3306/xbDB?useUnicode=true&characterEncoding=utf-8"; private static String name="tradingbp"; private static String pwd="123456"; /** * * 获取链接 * * @date 2017年8月31日 * @return */ public static Connection getOpenConnection(){ Connection conn= null; try { //加载驱动 Class.forName(driver); conn=(Connection) DriverManager.getConnection(url, name, pwd); System.out.println("获得数据库链接"); } catch (ClassNotFoundException e) { e.printStackTrace(); }catch (SQLException e) { e.printStackTrace(); } return conn; } public static void main(String[] args) { getOpenConnection(); } }
实体类:

/**
 * Plain data holder for one Java blog entry: title, link, author,
 * author home page and summary.
 */
public class JavaBokeModel {

    /** Title of the post. */
    private String title;

    /** Link address of the post. */
    private String linke;

    /** Author name. */
    private String author;

    /** Address of the author's home page. */
    private String authorUrl;

    /** Short summary of the post. */
    private String summary;

    /** @return the title */
    public String getTitle() {
        return this.title;
    }

    /** @param title the title to store */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the link address */
    public String getLinke() {
        return this.linke;
    }

    /** @param linke the link address to store */
    public void setLinke(String linke) {
        this.linke = linke;
    }

    /** @return the author name */
    public String getAuthor() {
        return this.author;
    }

    /** @param author the author name to store */
    public void setAuthor(String author) {
        this.author = author;
    }

    /** @return the author's home page address */
    public String getAuthorUrl() {
        return this.authorUrl;
    }

    /** @param authorUrl the author's home page address to store */
    public void setAuthorUrl(String authorUrl) {
        this.authorUrl = authorUrl;
    }

    /** @return the summary */
    public String getSummary() {
        return this.summary;
    }

    /** @param summary the summary to store */
    public void setSummary(String summary) {
        this.summary = summary;
    }
}
webmagic 框架爬取数据并保存
import java.sql.PreparedStatement; import java.sql.SQLException; import java.util.ArrayList; import java.util.Date; import java.util.List; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.processor.PageProcessor; import com.mysql.jdbc.Connection; import com.nio.webmagic.jdbc.MySqlJdbcUtils; import com.nio.webmagic.model.JavaBokeModel; /** * * 爬虫 * * @version [VCES V201R001, 2017年10月12日] * * @see 方法实现 PageProcessor * @since [产品/模块版本] */ public class JavaBoKePageProcessor implements PageProcessor { private static Connection conn=null; private static PreparedStatement ps =null; //标题和链接获取 private static String TITLEQUERY="div.post_item_body h3 a.titlelnk"; //作者 private static String AUTHORQUERY="div.post_item_foot a.lightblue "; //简介 private static String SUMMARYQUERY="div.post_item_body p.post_item_summary"; //插入sql语句 private static String insertSql ="INSERT INTO Boke (title,linke,author,authorUrl,summary)VALUES(?,?,?,?,?)"; //初始链接 private static Connection getConnection(){ if (conn==null) { conn = MySqlJdbcUtils.getOpenConnection(); } return conn; } /** * * insert操作 * * @date 2017年8月31日 * @return */ private synchronized void insertDb(List<JavaBokeModel> javaBokes){ try { ps = conn.prepareStatement(insertSql); for (JavaBokeModel javaBoke:javaBokes) { ps.setString(1, javaBoke.getTitle().toString()); ps.setString(2, javaBoke.getLinke().toString()); ps.setString(3, javaBoke.getAuthor().toString()); ps.setString(4, javaBoke.getAuthorUrl().toString()); ps.setString(5, javaBoke.getSummary().toString()); ps.executeUpdate(); } } catch (SQLException e) { // TODO Auto-generated catch block e.printStackTrace(); } } //初始化带爬取网页地址 private static List<String> urls(){ List<String> listUrl =new ArrayList<String>(); for (int i = 2; i <=200; i++) { //listUrl.add("http://www.cnblogs.com/cate/java/"+i); 
listUrl.add("http://www.cnblogs.com/cate/java/"+i); } listUrl.toArray(new String[listUrl.size()]); return listUrl; } /** * * jsoup根据 html 字符串和语法获取内容; * @date 2017年8月31日 * @param htmlText * @return */ private static String seletDocumentText(String htmlText,String Query){ Document doc = Jsoup.parse(htmlText); String select = doc.select(Query).text(); return select; } /** * * jsoup根据 html 字符串和语法获取链接地址; * @date 2017年8月31日 * @param htmlText * @return */ private static String seletDocumentLink(String htmlText,String Query){ Document doc = Jsoup.parse(htmlText); String select = doc.select(Query).attr("href"); return select; } /** * process是定制爬虫逻辑的核心接口,在这里编写抽取逻辑 * @see us.codecraft.webmagic.processor.PageProcessor#process(us.codecraft.webmagic.Page) */ @Override public void process(Page page) { // page.addTargetRequests(urls()); //div[@class='post_item']//div[@class='post_item_body']//h3//a[@class='titlelnk']/text()' // 定义如何抽取页面信息,并保存下来 List<String> htmls =page.getHtml().xpath("//div[@class='post_item']/html()").all(); List<JavaBokeModel> javaBokes=new ArrayList<JavaBokeModel>(); for (String html:htmls) { JavaBokeModel javaBoke =new JavaBokeModel(); //标题和链接 String title =seletDocumentText(html,TITLEQUERY); String linke =seletDocumentLink(html,TITLEQUERY); //作者和作者主页 String author=seletDocumentText(html, AUTHORQUERY); String authorUrl=seletDocumentLink(html, AUTHORQUERY); //简介 String summary=seletDocumentText(html, SUMMARYQUERY); javaBoke.setTitle(title); javaBoke.setAuthor(author); javaBoke.setAuthorUrl(authorUrl); javaBoke.setLinke(linke); javaBoke.setSummary(summary); javaBokes.add(javaBoke); } insertDb(javaBokes); } @Override public Site getSite() { //抓去网站的相关配置包括:编码、重试次数、抓取间隔 return Site.me().setSleepTime(1000).setRetryTimes(10); } public static void main(String[] args) { long startTime ,endTime; System.out.println("========小爬虫【启动】喽!========="); getConnection(); startTime = new Date().getTime(); //入口 Spider create = Spider.create(new JavaBoKePageProcessor()); //定义入口地址 
create.addUrl("http://www.cnblogs.com/cate/java/").thread(5).run(); try { ps.close(); conn.close(); } catch (Exception e) { // TODO: handle exception } endTime = new Date().getTime(); System.out.println("========小爬虫【结束】喽!========="); System.out.println("用时为:"+(endTime-startTime)/1000+"s"); } }
数据: