今天學完爬蟲之後,想着爬一下我們學校的教務系統,可是發現登錄的時候有驗證碼,因此研究了如何用Jsoup爬取帶驗證碼的網站:
大體的思路是:(需要注意的是__VIEWSTATE一直變化,所以我們每個頁面都需要重新獲取並帶着爬取下一個頁面)
1.先爬取網站的主頁,由於我們學校的網站是用ASP.NET開發的,所以需要爬到每個網頁的__VIEWSTATE。同時爬取主頁也可以獲得一個cookie(ASP.NET_SessionId)。
2.帶着上一步得到的cookie(ASP.NET_SessionId)爬取驗證碼。(網上說有專門識別驗證碼的軟件,在這裏我只是把驗證碼下載到本地,之後由用戶手動輸入驗證碼)獲取驗證碼圖片的時候必須帶着cookie,用來標識這是本次session請求的驗證碼;如果不帶sessionid,下載驗證碼之後輸入的驗證碼也無效。
3.輸入用戶名,密碼和驗證碼登錄系統,登錄系統需要攜帶一些其他參數(值為空也需要攜帶)。
4.登錄之後不能直接爬取成績,需要先爬取登錄成功之後的頁面,從中獲取新的__VIEWSTATE。
5.爬完登錄成功的主頁之后就可以進行爬取成績,將爬到的成績收集起來,最后輸出到html頁面中。
(在這個爬蟲的過程中需要注意__VIEWSTATE,每個頁面都需要重新獲取這個值,這個值放在input隱藏域中。另外爬取過程中請求頭需要攜帶Referer(也就是表示你從哪個頁面過來的),因為網站用它來防止盜鏈)
下面是代碼:
1.爬蟲的入口
package cn.qlq.craw.JsoupCrawJWXT; import java.io.IOException; import java.util.ArrayList; import java.util.List; import java.util.Map.Entry; import java.util.Scanner; /** * 爬蟲主的程序調度器(爬蟲教務系統的入口) * * @author liqiang * */ public class MainClass { public static void main(String[] args) { // 輸入學號和密碼 System.out.print("請輸入你要查詢學號:"); Scanner sc = new Scanner(System.in); String xuehao = sc.next(); System.out.print("請輸入密碼:"); String password = sc.next(); // Console con = System.console(); // String pswd = new String(con.readPassword());// 因為讀取的是字符數組,所以需要用new try { DownloadLoginfo downloadLoginfo = new DownloadLoginfo(); LoginClass loginClass = new LoginClass(); GradeOutput gradeOutput = new GradeOutput(); // 1.訪問主頁,獲取驗證碼與viewstate downloadLoginfo.getLogInfo(); // 2.登錄 loginClass.login(downloadLoginfo, xuehao, password); for (Entry<String, String> entry : loginClass.getCookies().entrySet()) { System.out.println("key:" + entry.getKey() + ";value" + entry.getValue()); } CrawGrade crawGrade = new CrawGrade(); //3. 爬取成績的上一個頁面 crawGrade.crawGradeLastPage(downloadLoginfo.getCookies(), downloadLoginfo.getViewState(), xuehao); List<String> condition = geneQueryCondition(); //4.循環分學年爬取成績 for (String xuenian : condition) { String html_content = crawGrade.crawGrade(xuenian, "2", downloadLoginfo.getCookies(), // 4.1爬取成績頁面 downloadLoginfo.getViewState(), xuehao); gradeOutput.collectGrade(html_content); } //5.輸出爬到的數據到html文件中 gradeOutput.outputDatas2Html(); } catch (IOException e) { System.out.println("無法連接學校服務器"); } catch (Exception e) { e.printStackTrace(); } } /** * 構造需要查詢的年份和學期 * * @return */ public static List<String> geneQueryCondition() { List<String> condition = new ArrayList<String>(); condition.add("2014-2015"); condition.add("2015-2016"); condition.add("2016-2017"); condition.add("2017-2018"); return condition; } }
2.爬取學校主頁獲取__VIEWSTATE和cookie
package cn.qlq.craw.JsoupCrawJWXT; import java.util.HashMap; import java.util.Map; import java.util.Map.Entry; import org.jsoup.Connection; import org.jsoup.Connection.Method; import org.jsoup.Connection.Response; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; /** * url獲取圖片並且保存到本地 * * @author liqiang * */ public class DownloadLoginfo { /** * 第一次訪問獲取的cookie(查看發現就返回一個cookie:ASP.NET_SessionId) */ private Map<String, String> cookies = null; /** * __viewstate 教務系統用於驗證的信息 */ private String viewState = null; public DownloadLoginfo() { this.cookies = new HashMap<String,String>();; this.viewState = ""; } /** * 獲取登錄信息 * 主要就是訪問一下主頁面,獲取一個__viewstate與cookie */ public void getLogInfo() throws Exception { String urlLogin = "http://newjwc.tyust.edu.cn/"; Connection connect = Jsoup.connect(urlLogin); // 偽造請求頭 connect.header("Accept", "application/json, text/javascript, */*; q=0.01").header("Accept-Encoding", "gzip, deflate"); connect.header("Accept-Language", "zh-CN,zh;q=0.9").header("Connection", "keep-alive"); connect.header("Content-Length", "213").header("Content-Type", "application/x-www-form-urlencoded; charset=UTF-8"); connect.header("Host", "newjwc.tyust.edu.cn").header("Referer", "http://newjwc.tyust.edu.cn/"); connect.header("User-Agent", "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36") .header("X-Requested-With", "XMLHttpRequest"); // 請求url獲取響應信息 Response res = connect.ignoreContentType(true).method(Method.POST).execute();// 執行請求 // 獲取返回的cookie this.cookies = res.cookies(); for (Entry<String, String> entry : cookies.entrySet()) { System.out.println(entry.getKey() + "-" + entry.getValue()); } // 獲取響應體 String body = res.body(); // 調用下面方法獲取__viewstate this.getViewState(body);// 獲取viewState //調用下載驗證碼的工具類下載驗證碼 JsoupDoloadPicture.downloadImg("http://newjwc.tyust.edu.cn/CheckCode.aspx", cookies);; } /** * 獲取viewstate * * @return */ public String getViewState(String 
htmlContent) { Document document = Jsoup.parse(htmlContent); Element ele = document.select("input[name='__VIEWSTATE']").first(); String value = ele.attr("value"); // 獲取到viewState this.viewState = value; return value; } public Map<String, String> getCookies() { return cookies; } public void setCookies(Map<String, String> cookies) { this.cookies = cookies; } public String getViewState() { return viewState; } public void setViewState(String viewState) { this.viewState = viewState; } }
3.帶着cookie爬取驗證碼,並下載到本地
package cn.qlq.craw.JsoupCrawJWXT; import java.io.BufferedOutputStream; import java.io.File; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.IOException; import java.util.Map; import org.apache.commons.io.FileUtils; import org.jsoup.Connection; import org.jsoup.Jsoup; /** * Jsoup帶着cookie下載驗證碼到本地(必須帶着cookie下載驗證碼,否則下載的驗證碼無效) * * @author liqiang * */ public class JsoupDoloadPicture { /** * 帶着cookie下載驗證碼圖片 * * @param url * @param cookies * @throws IOException */ public static void downloadImg(String url, Map<String, String> cookies) throws IOException { // TODO Auto-generated method stub Connection connect = Jsoup.connect(url); connect.cookies(cookies);// 攜帶cookies爬取圖片 connect.timeout(5 * 10000); Connection.Response response = connect.ignoreContentType(true).execute(); byte[] img = response.bodyAsBytes(); System.out.println(img.length); // 讀取文件存儲位置 String directory = ResourcesUtil.getValue("path", "file"); savaImage(img, directory, "yzm.png"); } /** * 保存圖片到本地 * @param img * @param filePath * @param fileName */ public static void savaImage(byte[] img, String filePath, String fileName) { BufferedOutputStream bos = null; FileOutputStream fos = null; File file = null; File dir = new File(filePath); try { //判斷文件目錄是否存在 if(dir.exists() && !dir.isDirectory()){ FileUtils.deleteQuietly(dir); } dir.mkdir(); file = new File(filePath + "\\" + fileName); fos = new FileOutputStream(file); bos = new BufferedOutputStream(fos); bos.write(img); System.out.println("驗證碼已經下載到:"+filePath); } catch (FileNotFoundException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } finally { if (bos != null) { try { bos.close(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } if (fos != null) { try { fos.close(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } } } }
4.登錄類
package cn.qlq.craw.JsoupCrawJWXT; import java.util.Map; import java.util.Map.Entry; import java.util.Scanner; import org.jsoup.Connection; import org.jsoup.Connection.Method; import org.jsoup.Connection.Response; import org.jsoup.Jsoup; /** * 登錄類(訪問登錄頁面獲取登錄的cookie) * * @author liqiang * */ public class LoginClass { /** * 記錄返回的cookie */ private Map<String, String> cookies = null; /** * 模擬登錄獲取cookie和sessionid * */ public void login(DownloadLoginfo downloadLoginfo, String xuehao, String mima) throws Exception { String urlLogin = "http://newjwc.tyust.edu.cn/default2.aspx"; Connection connect = Jsoup.connect(urlLogin); connect.timeout(5 * 100000); // 偽造請求頭 connect.header("Content-Length", "213").header("Content-Type", "application/x-www-form-urlencoded"); connect.header("Host", "newjwc.tyust.edu.cn").header("Referer", "http://newjwc.tyust.edu.cn/xscjcx.aspx?xh=" + xuehao + "&xm=%C7%C7%C0%FB%C7%BF&gnmkdm=N121613"); connect.header("User-Agent", "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"); // 輸入驗證碼 System.out.println("-----------請輸入驗證碼---------"); Scanner sc = new Scanner(System.in); String yzm = sc.next(); sc.close(); // 攜帶登陸信息 connect.data("txtUserName", xuehao).data("__VIEWSTATE", downloadLoginfo.getViewState()).data("TextBox2", mima) .data("Textbox1", "").data("RadioButtonList1", "").data("Button1", "").data("lbLanguage", "") .data("hidPdrs", "").data("hidsc", "").data("txtSecretCode", yzm); connect.cookies(downloadLoginfo.getCookies()); // 請求url獲取響應信息 Response res = connect.ignoreContentType(true).method(Method.POST).execute();// 執行請求 // 獲取返回的cookie this.cookies = res.cookies(); for (Entry<String, String> entry : cookies.entrySet()) { System.out.println(entry.getKey() + "-" + entry.getValue()); } System.out.println("---------獲取的登錄之后的頁面-----------"); String body = res.body();// 獲取響應體 System.out.println(body); } public Map<String, String> getCookies() { return cookies; } public void 
setCookies(Map<String, String> cookies) { this.cookies = cookies; } }
5.爬取登錄之后的主頁和成績
package cn.qlq.craw.JsoupCrawJWXT; import java.io.IOException; import java.util.Map; import org.jsoup.Connection; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; /** * 爬取成績的類 * * @author liqiang * */ public class CrawGrade { private String viewState; /** * 全局獲取viewstate的函數 * @param html * @return */ public String getViewState(String html){ Document document = Jsoup.parse(html); Element ele = document.select("input[name='__VIEWSTATE']").first(); String value = ele.attr("value"); this.viewState = value; // 獲取到viewState return value; } /** * 爬取獲取成績的上一個頁面(也就是剛登陸之后的頁面) * @param cookies * @param viewStata * @param xuehao * @return * @throws IOException */ public String crawGradeLastPage(Map<String,String> cookies,String viewStata,String xuehao) throws IOException{ String urlLogin = "http://newjwc.tyust.edu.cn/xscjcx.aspx?xh="+xuehao+"&xm=%C7%C7%C0%FB%C7%BF&gnmkdm=N121613"; Connection connect = Jsoup.connect(urlLogin); connect.timeout(5 * 100000); // 偽造請求頭 connect.header("Content-Length", "74642").header("Content-Type", "application/x-www-form-urlencoded"); connect.header("Host", "newjwc.tyust.edu.cn").header("Referer", "http://newjwc.tyust.edu.cn/xscjcx.aspx?xh=201420020123&xm=%C7%C7%C0%FB%C7%BF&gnmkdm=N121613"); connect.header("User-Agent", "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"); // 攜帶登陸信息 connect.data("xh","201420020123") .data("xm", viewStata) .data("hidLanguage", "") .data("gnmkdm", "N121613"); //設置cookie connect.cookies(cookies); Document document = connect.post(); System.out.println("-----------爬到的成績的上一個頁面--------------"); String html = document.toString(); System.out.println(html); // 重新獲取到viewState this.getViewState(html); return html; } /** * 爬取成績頁面 */ public String crawGrade(String xuenian,String xueqi,Map<String,String> cookies,String viewStata,String xuehao) throws IOException{ String urlLogin = 
"http://newjwc.tyust.edu.cn/xscjcx.aspx?xh="+xuehao+"&xm=%C7%C7%C0%FB%C7%BF&gnmkdm=N121613"; Connection connect = Jsoup.connect(urlLogin); connect.timeout(5 * 100000); // 偽造請求頭 connect.header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8") .header("Accept-Encoding", "gzip, deflate"); connect.header("Accept-Language", "zh-CN,zh;q=0.9").header("Connection", "keep-alive"); connect.header("Content-Length", "74642").header("Content-Type", "application/x-www-form-urlencoded"); connect.header("Host", "newjwc.tyust.edu.cn").header("Referer", "http://newjwc.tyust.edu.cn/xscjcx.aspx?xh=201420020123&xm=%C7%C7%C0%FB%C7%BF&gnmkdm=N121613"); connect.header("User-Agent", "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"); // 攜帶登陸信息 connect.data("__EVENTTARGET","") .data("__EVENTARGUMENT", "") .data("__VIEWSTATE", this.viewState) .data("hidLanguage","") .data("ddlXN", xuenian) .data("ddlXQ", xueqi) .data("btn_xn", "") .data("ddl_kcxz", ""); connect.cookies(cookies); Document document = connect.post(); System.out.println("-----------爬到的成績的頁面--------------"); String html = document.toString(); //更新viewstate this.getViewState(html); System.out.println(html); return html; } public void setViewState(String viewState) { this.viewState = viewState; } }
6.收集成績的類
package cn.qlq.craw.JsoupCrawJWXT; import java.io.File; import java.io.FileWriter; import java.io.IOException; import java.util.ArrayList; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; /** * 收集成績與輸出成績 * * @author liqiang * */ @SuppressWarnings("all") public class GradeOutput { /** * 保存成績的集合 */ private List<Map<String, Object>> datas; public GradeOutput() { this.datas = new ArrayList<Map<String, Object>>(); } /** * 收集成績 * * @param html * @return */ public String collectGrade(String html) { // 解析html Document document = Jsoup.parse(html); // 獲取成績表格 Element table = document.select("#Datagrid1").first(); // 選擇除表格表頭之外的元素 Elements trs = table.select("tr:gt(0)"); for (Element ele : trs) { Map result = new LinkedHashMap(); Elements ele0 = ele.select("td:eq(0)");// 找到學年 result.put("xuenian", ele0.text()); Elements ele1 = ele.select("td:eq(1)");// 找到學期 result.put("xueqi", ele1.text()); Elements ele3 = ele.select("td:eq(3)");// 找到課程名稱 result.put("kecheng", ele3.text()); Elements ele8 = ele.select("td:eq(8)");// 找到成績 result.put("chengji", ele8.text()); this.datas.add(result); } return null; } /** * 輸出成績到控制台 */ public void outPutGrade() { if (this.datas == null || this.datas.size() == 0) { return; } System.out.println("-------下面是提取到的成績--------"); for (Map result : datas) { System.out.println(result.get("xuenian") + "\t" + result.get("xueqi") + "\t" + result.get("kecheng") + "\t" + result.get("chengji") + "\t"); } } /** * 最后處理所有的數據,寫出到html或者保存數據庫 * * @throws IOException */ public void outputDatas2Html() throws IOException { if (datas != null && datas.size() > 0) { // 讀取文件存儲位置 String directory = ResourcesUtil.getValue("path", "file"); File file = new File(directory+"\\gradeOut.html"); // 如果文件不存在就創建文件 if (!file.exists()) { file.createNewFile(); } // 構造FileWriter用於向文件中輸出信息(此構造方法可以接收file參數,也可以接收fileName參數) FileWriter fileWriter = 
new FileWriter(file); // 開始寫入數據 fileWriter.write("<html>"); fileWriter.write("<head>"); fileWriter.write("<title>xxx成績單</title>"); fileWriter .write("<style>table{width:100%;table-layout: fixed;word-break: break-all; word-wrap: break-word;}" + "table td{border:1px solid black;width:300px}</style>"); fileWriter.write("</head>"); fileWriter.write("<body>"); fileWriter.write("<table cellpadding='0' cellspacing='0' style='text-align:center;'>"); fileWriter.write( "<tr style='background-color:#95caca;font-size:20px'><td>學年</td><td>學期</td><td>課程名字</td><td>成績</td></tr>"); for (Map<String, Object> data : datas) { String xuenian = (String) data.get("xuenian"); String xueqi = (String) data.get("xueqi"); String kecheng = (String) data.get("kecheng"); String chengji = (String) data.get("chengji"); fileWriter.write("<tr>"); fileWriter.write("<td>" + xuenian + "</td>"); fileWriter.write("<td>" + xueqi + "</td>"); fileWriter.write("<td>" + kecheng + "</td>"); fileWriter.write("<td>" + chengji + "</td>"); fileWriter.write("</tr>"); } fileWriter.write("</table>"); fileWriter.write("</body>"); fileWriter.write("</html>"); // 關閉文件流 fileWriter.close(); } } public List<Map<String, Object>> getDatas() { return datas; } public void setDatas(List<Map<String, Object>> datas) { this.datas = datas; } }
path.properties (設置驗證碼圖片和最后的成績單輸出到哪個位置)
#fileToSave
#yzm
file=C:\\Users\\liqiang\\Desktop
讀取上述配置文件的工具類:
package cn.qlq.craw.JsoupCrawJWXT; import java.io.Serializable; import java.text.MessageFormat; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import java.util.Locale; import java.util.ResourceBundle; import java.util.Set; /** * 資源文件讀取工具類 * */ public class ResourcesUtil implements Serializable { private static final long serialVersionUID = -7657898714983901418L; /** * 系統語言環境,默認為中文zh */ public static final String LANGUAGE = "zh"; /** * 系統國家環境,默認為中國CN */ public static final String COUNTRY = "CN"; private static Locale getLocale() { Locale locale = new Locale(LANGUAGE, COUNTRY); return locale; } /** * 根據語言、國家、資源文件名和key名字獲取資源文件值 * * @param language * 語言 * * @param country * 國家 * * @param baseName * 資源文件名 * * @param section * key名字 * * @return 值 */ private static String getProperties(String baseName, String section) { String retValue = ""; try { Locale locale = getLocale(); ResourceBundle rb = ResourceBundle.getBundle(baseName, locale); retValue = (String) rb.getObject(section); } catch (Exception e) { e.printStackTrace(); // TODO 添加處理 } return retValue; } /** * 通過key從資源文件讀取內容 * * @param fileName * 資源文件名 * * @param key * 索引 * * @return 索引對應的內容 */ public static String getValue(String fileName, String key) { String value = getProperties(fileName,key); return value; } public static List<String> gekeyList(String baseName) { Locale locale = getLocale(); ResourceBundle rb = ResourceBundle.getBundle(baseName, locale); List<String> reslist = new ArrayList<String>(); Set<String> keyset = rb.keySet(); for (Iterator<String> it = keyset.iterator(); it.hasNext();) { String lkey = (String)it.next(); reslist.add(lkey); } return reslist; } /** * 通過key從資源文件讀取內容,並格式化 * * @param fileName * 資源文件名 * * @param key * 索引 * * @param objs * 格式化參數 * * @return 格式化后的內容 */ public static String getValue(String fileName, String key, Object[] objs) { String pattern = getValue(fileName, key); String value = MessageFormat.format(pattern, objs); return value; } public static 
void main(String[] args) { System.out.println(getValue("resources.messages", "101",new Object[]{100,200})); //根據操作系統環境獲取語言環境 /*Locale locale = Locale.getDefault(); System.out.println(locale.getCountry());//輸出國家代碼 System.out.println(locale.getLanguage());//輸出語言代碼s //加載國際化資源(classpath下resources目錄下的messages.properties,如果是中文環境會優先找messages_zh_CN.properties) ResourceBundle rb = ResourceBundle.getBundle("resources.messages", locale); String retValue = rb.getString("101");//101是messages.properties文件中的key System.out.println(retValue); //信息格式化,如果資源中有{}的參數則需要使用MessageFormat格式化,Object[]為傳遞的參數,數量根據資源文件中的{}個數決定 String value = MessageFormat.format(retValue, new Object[]{100,200}); System.out.println(value); */ } }
git地址:https://github.com/qiao-zhi/javaCraw