最近在用Jsoup抓取某網站數據,可有些頁面是ajax請求動態生成的,去群里問了一下,大神說模擬ajax請求即可。去網上搜索了一下,發現了這篇文章,拿過來先用着試試。
轉帖如下:
網上關於網絡爬蟲實現方式有很多種,但是很多都不支持Ajax,李兄說:模擬才是王道。確實,如果能夠模擬一個沒有界面的瀏覽器,還有什麼不能做到的呢? 關於解析Ajax網站的框架也有不少,我選擇了HtmlUnit,官方網站:http://htmlunit.sourceforge.net/ 。HtmlUnit可以說是一個Java版本的無界面瀏覽器,幾乎無所不能,而且很多東西都封裝得特別完美。這是這幾天來積累下來的心血,記錄一下。
package com.lanyotech.www.wordbank;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.MalformedURLException;
import java.util.List;
import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException;
import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;
import com.gargoylesoftware.htmlunit.ScriptResult;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlOption;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import com.gargoylesoftware.htmlunit.html.HtmlSelect;
public class WorldBankCrawl {
private static String TARGET_URL = "http://databank.worldbank.org/ddp/home.do";
public static void main(String[] args) throws FailingHttpStatusCodeException, MalformedURLException, IOException {
//模擬一個瀏覽器
WebClient webClient = new WebClient();
//設置webClient的相關參數
webClient.setJavaScriptEnabled(true);
webClient.setCssEnabled(false);
webClient.setAjaxController(new NicelyResynchronizingAjaxController());
webClient.setTimeout(35000);
webClient.setThrowExceptionOnScriptError(false);
//模擬瀏覽器打開一個目標網址
HtmlPage rootPage= webClient.getPage(TARGET_URL);
//獲取第一個數據庫
HtmlSelect hs = (HtmlSelect) rootPage.getElementById("lstCubes");
//按要求選擇第一個數據庫
hs.getOption(0).setSelected(true);
//模擬點擊Next按鈕,跳轉到第二個頁面
System.out.println("正在跳轉…");
//執行按鈕出發的js事件
ScriptResult sr = rootPage.executeJavaScript("javascript:setCubeData(2,-1,4,'/ddp');");
//跳轉到第二個頁面,選擇國家
HtmlPage countrySelect = (HtmlPage) sr.getNewPage();
//獲得包含全部國家信息的選擇框頁面
HtmlPage framePage=(HtmlPage)countrySelect.getFrameByName("frmTree1″).getEnclosedPage();
//獲得selectAll按鈕,觸發js事件
framePage.executeJavaScript("javascript:TransferListAll(‘countrylst','countrylstselected','no');SetSelectedCount(‘countrylstselected','tdcount');");
//獲取Next按鈕,觸發js事件
ScriptResult electricityScriptResult = framePage.executeJavaScript("javascript:wrapperSetCube('/ddp')");
System.out.println("正在跳轉…");
//跳轉到下一個頁面electricitySelect
HtmlPage electricitySelect = (HtmlPage) electricityScriptResult.getNewPage();
//獲得electricity選擇的iframe
HtmlPage electricityFrame = (HtmlPage) electricitySelect.getFrameByName("frmTree1″).getEnclosedPage();
//獲得選擇框
HtmlSelect seriesSelect = (HtmlSelect) electricityFrame.getElementById("countrylst");
//獲得所有的選擇框內容
List optionList = seriesSelect.getOptions();
//將指定的選項選中
optionList.get(1).setSelected(true);
//模擬點擊select按鈕 electricityFrame.executeJavaScript("javascript:TransferList('countrylst','countrylstselected','no');SetSelectedCount('countrylstselected','tdcount');");
//獲取選中后,下面的選擇框
HtmlSelect electricitySelected = (HtmlSelect) electricityFrame.getElementById("countrylstselected");
List list = electricitySelected.getOptions();
//模擬點擊Next按鈕,跳轉到選擇時間的頁面
ScriptResult timeScriptResult = electricityFrame.executeJavaScript("javascript:wrapperSetCube('/ddp')");
System.out.println("正在跳轉…");
HtmlPage timeSelectPage = (HtmlPage) timeScriptResult.getNewPage();
//獲取選中時間的選擇框
timeSelectPage = (HtmlPage) timeSelectPage.getFrameByName("frmTree1″).getEnclosedPage();
//選中所有的時間 timeSelectPage.executeJavaScript("javascript:TransferListAll('countrylst','countrylstselected','no');SetSelectedCount('countrylstselected','tdcount');");
//點擊Next按鈕
ScriptResult exportResult = timeSelectPage.executeJavaScript("javascript:wrapperSetCube('/ddp')");
System.out.println("正在跳轉…");
//轉到export頁面
HtmlPage exportPage = (HtmlPage) exportResult.getNewPage();
//點擊頁面上的Export按鈕,進入下載頁面
ScriptResult downResult = exportPage.executeJavaScript("javascript:exportData('/ddp' ,'EXT_BULK' ,'WDI_Time=51||WDI_Series=1||WDI_Ctry=244||' );");
System.out.println("正在跳轉…");
HtmlPage downLoadPage = (HtmlPage) downResult.getNewPage();
//點擊Excel圖標,開始下載
ScriptResult downLoadResult = downLoadPage.executeJavaScript("javascript:exportData('/ddp','BULKEXCEL');");
//下載Excel文件
InputStream is = downLoadResult.getNewPage().getWebResponse().getContentAsStream();
OutputStream fos = new FileOutputStream("d://test.xls");
byte[] buffer=new byte[1024*30];
int len=-1;
while((len=is.read(buffer))>0){
fos.write(buffer, 0, len);
}
fos.close();
fos.close();
System.out.println("Success!");
}
}
注釋:
/** Request a web page with HtmlUnit (uses the getOptions()-based configuration API). */
WebClient wc = new WebClient();
wc.getOptions().setJavaScriptEnabled(true); // enable the JS interpreter; default is true
wc.getOptions().setCssEnabled(false); // disable CSS support
wc.getOptions().setThrowExceptionOnScriptError(false); // whether to throw an exception when a script error occurs
wc.getOptions().setTimeout(10000); // connection timeout, here 10 s; 0 means wait indefinitely
HtmlPage page = wc.getPage("http://cq.qq.com/baoliao/detail.htm?294064");
String pageXml = page.asXml(); // get the response text as XML
