WebMagic框架教程 http://webmagic.io/docs/zh/
爬取世紀佳緣小姐姐信息
/**
 * Entity holding one scraped member profile.
 *
 * <p>Plain mutable bean: a no-arg constructor plus getters/setters for every field, and a
 * convenience constructor that fills every field except {@code id}.
 *
 * @author mxh
 * @since 2019/5/17 13:44
 */
public class Info {

    /** Record identifier; left {@code null} by both constructors. */
    private Integer id;

    /** Nickname. */
    private String name;

    /** Photo (the value scraped from the image element). */
    private String image;

    /** Basic information. */
    private String info;

    /** Love declaration. */
    private String mottos;

    /** Recommendation reason. */
    private String reason;

    /** Creates an empty instance; all fields are {@code null}. */
    public Info() {
    }

    /**
     * Creates an instance with every field except {@code id} populated.
     *
     * @param name   nickname
     * @param image  photo
     * @param info   basic information
     * @param mottos love declaration
     * @param reason recommendation reason
     */
    public Info(String name, String image, String info, String mottos, String reason) {
        this.name = name;
        this.image = image;
        this.info = info;
        this.mottos = mottos;
        this.reason = reason;
    }

    public Integer getId() {
        return id;
    }

    public void setId(Integer id) {
        this.id = id;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public String getImage() {
        return image;
    }

    public void setImage(String image) {
        this.image = image;
    }

    public String getInfo() {
        return info;
    }

    public void setInfo(String info) {
        this.info = info;
    }

    public String getMottos() {
        return mottos;
    }

    public void setMottos(String mottos) {
        this.mottos = mottos;
    }

    public String getReason() {
        return reason;
    }

    public void setReason(String reason) {
        this.reason = reason;
    }

    /** Renders all fields as {@code Info{id=..., name='...', ...}}. */
    @Override
    public String toString() {
        return "Info{"
                + "id=" + id
                + ", name='" + name + '\''
                + ", image='" + image + '\''
                + ", info='" + info + '\''
                + ", mottos='" + mottos + '\''
                + ", reason='" + reason + '\''
                + '}';
    }
}
dao層
import org.springframework.stereotype.Repository;

/**
 * DAO-layer mapper for {@link Info} records.
 *
 * <p>The interface has no implementation of its own here; the accompanying mapper.xml declares a
 * mapper with this interface as its namespace and an {@code addInfo} insert statement.
 *
 * @author mxh
 * @since 2019/5/17 13:46
 */
@Repository
public interface SJJYMapper {

    /**
     * Stores one profile via the {@code addInfo} statement
     * ({@code INSERT INTO info(name,image,info,mottos,reason) ...}).
     *
     * @param info profile to insert; its name, image, info, mottos and reason are bound to the statement
     * @return the result of the mapped insert (for a MyBatis insert, the number of affected rows)
     */
    int addInfo(Info info);
}
爬蟲框架持久層
import org.springframework.beans.factory.annotation.Autowired; import org.springframework.stereotype.Service; import us.codecraft.webmagic.ResultItems; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.pipeline.Pipeline; /** * @auther mxh * @time 2019/5/17 13:59 * * 爬蟲框架dao層 */ @Service public class SJJYPipeline implements Pipeline { @Autowired private SJJYMapper sjjyMapper; @Override public void process(ResultItems resultItems, Task task) { System.out.println("get page: " + resultItems.getRequest().getUrl()); String[] names = resultItems.get("names").toString().split(","); String[] images = resultItems.get("images").toString().split(","); String[] infos = resultItems.get("infos").toString().split(","); String[] mottoes = resultItems.get("mottoes").toString().split(","); String[] reasons = resultItems.get("reasons").toString().split(","); for (int i=0;i<names.length;i++){ Info info = new Info(names[i],images[i],infos[i],mottoes[i],reasons[i]); sjjyMapper.addInfo(info); System.out.println("add info: " + info.toString()); } } }
爬蟲框架數據篩選邏輯層
import org.apache.http.Header; import org.apache.http.HttpResponse; import org.apache.http.NameValuePair; import org.apache.http.client.config.RequestConfig; import org.apache.http.client.entity.UrlEncodedFormEntity; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpGet; import org.apache.http.client.methods.HttpPost; import org.apache.http.client.protocol.HttpClientContext; import org.apache.http.client.utils.DateUtils; import org.apache.http.client.utils.URIBuilder; import org.apache.http.config.Registry; import org.apache.http.config.RegistryBuilder; import org.apache.http.cookie.CookieOrigin; import org.apache.http.cookie.CookieSpecProvider; import org.apache.http.cookie.MalformedCookieException; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.DefaultHttpClient; import org.apache.http.impl.client.HttpClients; import org.apache.http.impl.cookie.DefaultCookieSpec; import org.apache.http.message.BasicHeader; import org.openqa.selenium.By; import org.openqa.selenium.Cookie; import org.openqa.selenium.WebDriver; import org.openqa.selenium.chrome.ChromeDriver; import org.springframework.stereotype.Service; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.processor.PageProcessor; import us.codecraft.webmagic.selector.Html; import java.io.*; import java.net.HttpURLConnection; import java.net.URISyntaxException; import java.net.URL; import java.net.URLEncoder; import java.util.*; /** * @auther mxh * @time 2019/5/16 17:01 * * 爬蟲框架數據篩選邏輯層 */ @Service public class SJJYProcessor implements PageProcessor { private Site site = Site.me().setCharset("utf8").setRetryTimes(1000).setSleepTime(1000); // 用來存儲cookie信息 private Set<Cookie> cookies; @Override public void process(Page page) { Html html = page.getHtml(); //照片 List<String> images = html.xpath("//li/div[@class=\"jysp_huiyuan\"]/div[@class=\"jysp_huiyuanPic\"]/a/img/@_src").all(); 
//姓名 List<String> names = html.xpath("//li/div[@class=\"jysp_huiyuan\"]/div[@class=\"jysp_huiyuanInfo\"]/div[@class=\"jysp_huiyuanName\"]/a/text()").all(); //基本信息 List<String> infos = html.xpath("//li/div[@class=\"jysp_huiyuan\"]/div[@class=\"jysp_huiyuanInfo\"]/div[@class=\"jysp_huiyuanArea\"]/text()").all(); //愛情宣言 List<String> mottos = html.xpath("//li/div[@class=\"jysp_huiyuan\"]/div[@class=\"jysp_huiyuanInfo\"]/div[@class=\"jysp_huiyuanText\"]/text()").all(); //推薦理由 List<String> reasons = html.xpath("//li/div[@class=\"jysp_huiyuan\"]/div[@class=\"jysp_huiyuanInfo\"]/div[@class=\"jysp_huiyuanLy\"]/text()").all(); /*輸出到控制台 並使dao層接收到數據*/ page.putField("names",names); page.putField("images",images); page.putField("infos",infos); page.putField("mottoes",mottos); page.putField("reasons",reasons); } @Override public Site getSite() { //設置主機地址 site.setDomain("www.jiayuan.com"); //手動設置cookie //site.addCookie("PHPSESSID","f16de947c3a48a1084d22dd7e72cd283"); /*site.addCookie("PHPSESSID","8b392aacbf80a4d6cf102938271273a7"); site.addCookie("COMMON_HASH","0d8c3daa82c80277292723d74ff197d0"); site.addCookie("PROFILE","207838031%3A%25E5%25BD%25BC%25E5%25BE%2597%25E5%25B8%2595%25E5%2585%258B%3Am%3Aimages1.jyimg.com%2Fw4%2Fglobal%2Fi%3A0%3A%3A1%3Azwzp_m.jpg%3A1%3A1%3A50%3A10%3A3.0"); site.addCookie("RAW_HASH","fYGR2xG5XJL10gfFF4mP3qO0yN65wBrTZpeOrelDWKHerbx69EjQ138l9BfHlTYP%2AGuyrs-5xYCSsUMipqBNkKqExN%2AWVe7sWAWAa5w8VXf-TMA."); site.addCookie("SESSION_HASH","c2dbd047d891295d1b3e4d5b4cb687e71eeb1afd"); site.addCookie("accessID","20190516163650639629"); site.addCookie("ip_loc","31"); site.addCookie("save_jy_login_name","15735400536"); site.addCookie("stadate1","206838031"); site.addCookie("user_access","1"); site.addCookie("main_search:207838031","%7C%7C%7C00"); site.addCookie("last_login_time","1558057676");*/ //自動追加 for (org.apache.http.cookie.Cookie cookie : cookies) {
site.addCookie(cookie.getName().toString(), cookie.getValue().toString());
}
return site;
}
// 自動登陸方法
/**
 * Logs in through a Selenium-driven Chrome browser and stores the resulting cookies in
 * {@code cookies}.
 *
 * <p>The browser session is always ended with {@code quit()}, even when a lookup or click fails,
 * so a failed login does not leave the browser open.
 *
 * <p>NOTE(security): the account name and password are hard-coded below; they should be moved to
 * external configuration.
 */
public void login() {
    // Keep a driver location supplied via -Dwebdriver.chrome.driver; otherwise use the default path.
    if (System.getProperty("webdriver.chrome.driver") == null) {
        System.setProperty("webdriver.chrome.driver", "D:\\chromedriver.exe");
    }

    WebDriver driver = new ChromeDriver();
    try {
        driver.get("http://login.jiayuan.com/?refrer=http://www.jiayuan.com&host=0");
        // Give the login page time to load.
        sleepQuietly(2000);

        driver.findElement(By.id("login_email")).sendKeys("15735400536"); // user name
        driver.findElement(By.id("login_password")).sendKeys("mxh970923"); // password
        // Click the element whose id is "login_btn".
        driver.findElement(By.xpath("//*[@id=\"login_btn\"]")).click();

        // Wait for the post-login page; this is also the window for solving the captcha by hand.
        sleepQuietly(15000);

        cookies = driver.manage().getCookies();
    } finally {
        driver.quit();
    }
}

/** Sleeps for {@code millis} milliseconds; if interrupted, restores the interrupt flag and returns. */
private static void sleepQuietly(long millis) {
    try {
        Thread.sleep(millis);
    } catch (InterruptedException e) {
        Thread.currentThread().interrupt();
    }
}
controller
import org.apache.http.client.methods.HttpGet; import org.apache.http.impl.client.BasicCookieStore; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClients; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.stereotype.Controller; import org.springframework.web.bind.annotation.*; import us.codecraft.webmagic.Spider; import org.apache.http.cookie.Cookie; import java.util.List; import org.apache.http.client.CookieStore; /** * @auther mxh * @time 2019/5/16 17:10 */ @Controller public class SJJYController { @Autowired private SJJYProcessor sjjyProcessor; @Autowired private SJJYPipeline sjjyPipeline; @ResponseBody @RequestMapping(value = "/start",method = RequestMethod.GET) public String start(){ //模擬瀏覽器自動登錄 sjjyProcessor.login(); for (int i=1;i<=9;i++){ Spider.create(sjjyProcessor) .addUrl("http://www.jiayuan.com/usercp/dynmatch/ajax/jymatch_list.php?p="+i) .addPipeline(sjjyPipeline) .thread(5) .run(); } return "success"; } @ResponseBody @RequestMapping(value = "/login",method = RequestMethod.GET) public String login(){ String url ="https://passport.jiayuan.com/dologin.php?pre_url=http://usercp.jiayuan.com/v2/"; try { sjjyProcessor.getCookieBySendPost(url); } catch (Exception e) { e.printStackTrace(); } return "login success"; } @ResponseBody @RequestMapping(value = "/test2",method = RequestMethod.GET) public String test2(){ /*// TODO Auto-generated method stub String url="https://passport.jiayuan.com/dologin.php?pre_url=http://usercp.jiayuan.com/v2/"; //POST的URL HttpPost httppost=new HttpPost(url); //建立HttpPost對象 List<NameValuePair> params=new ArrayList<NameValuePair>(); //建立一個NameValuePair數組,用於存儲欲傳送的參數 params.add(new BasicNameValuePair("pwd","2544")); HttpResponse response = null; //添加參數 try { httppost.setEntity(new UrlEncodedFormEntity(params, HTTP.UTF_8)); //設置編碼 response = new DefaultHttpClient().execute(httppost); } catch (UnsupportedEncodingException e) { e.printStackTrace(); } 
catch (IOException e){ e.printStackTrace(); } //發送Post,並返回一個HttpResponse對象 //Header header = response.getFirstHeader("Content-Length"); //String Length=header.getValue(); // 上面兩行可以得到指定的Header if(response.getStatusLine().getStatusCode()==200){//如果狀態碼為200,就是正常返回 String result= response.getEntity().getContent(); //得到返回的字符串 System.out.println(result); }*/ // TODO Auto-generated method stub CloseableHttpClient httpClient = null; //創建GET請求 HttpGet httpget = new HttpGet("https://passport.jiayuan.com/dologin.php?pre_url=http://usercp.jiayuan.com/v2/"); String result = null; try { CookieStore cookieStore = new BasicCookieStore(); httpClient = HttpClients.custom().setDefaultCookieStore(cookieStore).build(); httpClient.execute(httpget); String PHPSESSID = null; List<Cookie> cookies = cookieStore.getCookies(); System.out.println(cookies); for (int i = 0; i < cookies.size(); i++) { if (cookies.get(i).getName().equals("PHPSESSID")) { PHPSESSID = cookies.get(i).getValue(); System.out.println(PHPSESSID); } } } catch (Exception ex) { ex.printStackTrace(); } return "Hello World"; } }
application.properties
# HTTP port of the embedded web server
server.port=8001

# MyBatis: package scanned for type aliases, and location of the mapper XML on the classpath
mybatis.type-aliases-package=com.example.shijijiayuan.demo
mybatis.mapper-locations=classpath*:mapper.xml

# MySQL data source (replace ****** with the database name)
spring.datasource.url=jdbc:mysql://localhost:3306/******
spring.datasource.username=root
spring.datasource.password=root
spring.datasource.driver-class-name=com.mysql.jdbc.Driver
mapper.xml
<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE mapper PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN" "http://mybatis.org/dtd/mybatis-3-mapper.dtd" >
<!-- Statements for the SJJYMapper interface; the namespace is the interface's fully-qualified name. -->
<mapper namespace="com.example.shijijiayuan.demo.SJJYMapper" >
<!-- addInfo: inserts one Info row; the #{...} placeholders are bound from the Info parameter's properties. The id column is not part of the insert. -->
<insert id="addInfo" parameterType="com.example.shijijiayuan.demo.Info">
INSERT INTO info(name,image,info,mottos,reason) VALUES(#{name}, #{image}, #{info}, #{mottos}, #{reason})
</insert>
</mapper>
pom.xml
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <parent>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-parent</artifactId>
        <version>2.1.5.RELEASE</version>
        <relativePath/> <!-- lookup parent from repository -->
    </parent>

    <groupId>com.example</groupId>
    <artifactId>shijijiayuan</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <name>shijijiayuan</name>
    <description>Demo project for Spring Boot</description>

    <properties>
        <java.version>1.8</java.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-web</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-test</artifactId>
            <scope>test</scope>
        </dependency>

        <!-- WebMagic crawler framework -->
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-core</artifactId>
            <version>0.7.3</version>
        </dependency>
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-extension</artifactId>
            <version>0.7.3</version>
        </dependency>

        <!-- MyBatis and the MySQL JDBC driver -->
        <dependency>
            <groupId>org.mybatis.spring.boot</groupId>
            <artifactId>mybatis-spring-boot-starter</artifactId>
            <version>2.0.1</version>
        </dependency>
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>5.1.30</version>
        </dependency>

        <!-- Selenium (browser-driven login); selenium-api is declared once: a repeated declaration is a duplicate-dependency warning -->
        <dependency>
            <groupId>org.seleniumhq.selenium</groupId>
            <artifactId>selenium-api</artifactId>
            <version>3.14.0</version>
        </dependency>
        <dependency>
            <groupId>org.seleniumhq.selenium</groupId>
            <artifactId>selenium-chrome-driver</artifactId>
            <version>3.14.0</version>
        </dependency>

        <!-- Apache HttpClient -->
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.5.8</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-maven-plugin</artifactId>
            </plugin>
        </plugins>
    </build>
</project>
記得下載相應的瀏覽器驅動,注意版本號要一致哦
博主這裡用的是谷歌瀏覽器驅動
世紀佳緣網站登錄要做驗證碼驗證,博主暫時不會寫那麼智能的代碼,所以只能手動選擇了
代碼有些jar包可能導的不正確,注意哦,不要盲目copy