|
編寫不易,轉載請注明(http://shihlei.iteye.com/blog/2067707)! 一 概述 HttpClient適合處理靜態資源,網絡爬蟲等類似應用很大程度需要處理動態網頁(內容由js填充,如百度圖片,body里基本沒有數據,碰到最麻煩的是新浪微博列表頁)。將網頁下載后,結合JS和Dom模型還原網頁,我目前還未攻破,但在下載層還原網頁,HtmlUnit是一種解決方案,雖然對JS的支持還是不完美。 HtmlUnit其實是自動化測試工具,集成了下載(HttpClient),Dom(NekoHtml),驅動JS(Rhino)。有一定的網頁渲染能力,由於會驅動Dom,會消耗些CPU,內存。 本文描述HtmlUnit請求響應,設置cookies,設置代理,驅動JS等方法。 二 版本 <dependency> <groupId>net.sourceforge.htmlunit</groupId> <artifactId>htmlunit</artifactId> <version>2.14</version> </dependency> 三 典型功能 1) 打開google搜索百度 /**
* 打開google 搜索百度
*
* @param args
* @throws Exception
*/
public static void main(String[] args) throws Exception {
    String url = "http://www.google.com.hk";
    final WebClient webClient = new WebClient();
    try {
        HtmlPage htmlPage = webClient.getPage(url);
        // HtmlUnit DOM model: look up the form whose name attribute is "f".
        HtmlForm form = htmlPage.getFormByName("f");
        // Look up the text input whose name attribute is "q".
        HtmlTextInput text = form.getInputByName("q");
        // Type the search term.
        text.setText("baidu");
        // Look up the submit button whose name attribute is "btnG".
        HtmlSubmitInput button = form.getInputByName("btnG");
        // Clicking the button submits the form and yields the result page.
        HtmlPage listPage = button.click();
        System.out.println(listPage.asXml());
    } finally {
        // Close the client even when loading the page or finding an element throws;
        // previously this call was skipped on any exception.
        webClient.closeAllWindows();
    }
}
2)獲取動態頁面 /**
* 獲取百度圖片js后的內容
*
* @throws Exception
*/
public void demo2() throws Exception {
    String url = "http://image.baidu.com/i?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=result&fr=&sf=1&fmq=1400328281672_R&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&word=html";
    final WebClient webClient = new WebClient();
    try {
        // 1. Enable JavaScript.
        webClient.getOptions().setJavaScriptEnabled(true);
        // 2. Disable CSS, which avoids the follow-up requests for stylesheets.
        webClient.getOptions().setCssEnabled(false);
        // 3. Follow redirects.
        webClient.getOptions().setRedirectEnabled(true);
        // 4. Do not throw when a script on the page fails.
        webClient.getOptions().setThrowExceptionOnScriptError(false);
        // 5. Timeout passed to the client options.
        webClient.getOptions().setTimeout(50000);
        HtmlPage htmlPage = webClient.getPage(url);
        // Give background JavaScript up to 10 seconds to finish updating the DOM.
        webClient.waitForBackgroundJavaScript(10000);
        // Print the page as it stands after the scripts have run.
        System.out.println(htmlPage.asXml());
    } finally {
        // Close the client even when the page load throws;
        // previously this call was skipped on any exception.
        webClient.closeAllWindows();
    }
}
四 樣例 (1)請求響應 /**
* Get請求
* @param url
* @return
* @throws Exception
*/
public static byte[] sendGetRequest(String url) throws Exception{
    WebClient webClient = new WebClient();
    try {
        WebRequest webRequest = new WebRequest(new URL(url));
        webRequest.setHttpMethod(HttpMethod.GET);
        // sendRequest returns a fully materialised byte[], so the client can be closed afterwards.
        return sendRequest(webClient, webRequest);
    } finally {
        // The client is created here and was never closed before; release it on every path.
        webClient.closeAllWindows();
    }
}
/**
 * Sends an HTTP POST request and returns the response body.
 *
 * @param url    absolute URL to post to
 * @param params form parameters to send; may be {@code null} or empty, in which case none are set
 * @return the bytes returned by {@code sendRequest} for this request
 * @throws Exception if the URL is malformed or the request fails
 */
public static byte[] sendPostRequest(String url,Map<String,String> params) throws Exception{
    WebClient webClient = new WebClient();
    try {
        WebRequest webRequest = new WebRequest(new URL(url));
        webRequest.setHttpMethod(HttpMethod.POST);
        if (params != null && !params.isEmpty()) {
            // Build a fresh list and install it with setRequestParameters: the list handed
            // out by getRequestParameters() is not guaranteed to be mutable, so calling
            // add() on it directly can fail with UnsupportedOperationException.
            java.util.List<NameValuePair> pairs =
                    new java.util.ArrayList<NameValuePair>(webRequest.getRequestParameters());
            for (Entry<String, String> param : params.entrySet()) {
                pairs.add(new NameValuePair(param.getKey(), param.getValue()));
            }
            webRequest.setRequestParameters(pairs);
        }
        // sendRequest returns a fully materialised byte[], so the client can be closed afterwards.
        return sendRequest(webClient, webRequest);
    } finally {
        // The client is created here and was never closed before; release it on every path.
        webClient.closeAllWindows();
    }
}
/**
 * Low-level request: executes {@code webRequest} on {@code webClient} and returns the body.
 *
 * <p>For an HTML page the method waits for background JavaScript and returns the page's
 * {@code asXml()} text encoded as UTF-8; for any other page type it returns the raw
 * response stream contents.
 *
 * @param webClient  client used to execute the request
 * @param webRequest request to execute
 * @return the response body, or {@code null} when the status code is not 200
 * @throws Exception if the request or reading the response fails
 */
private static byte[] sendRequest(WebClient webClient,WebRequest webRequest) throws Exception{
    byte[] responseContent = null;
    Page page = webClient.getPage(webRequest);
    WebResponse webResponse = page.getWebResponse();
    try {
        int status = webResponse.getStatusCode();
        System.out.println("Charset : " + webResponse.getContentCharset());
        System.out.println("ContentType : " + webResponse.getContentType());
        // Only a 200 response has its content read.
        if (status == 200) {
            if (page.isHtmlPage()) {
                // Give background JavaScript up to 10 seconds to finish.
                webClient.waitForBackgroundJavaScript(10000);
                // Encode explicitly: the no-arg getBytes() depends on the platform default charset.
                responseContent = ((HtmlPage) page).asXml().getBytes("UTF-8");
            } else {
                InputStream bodyStream = webResponse.getContentAsStream();
                try {
                    responseContent = ByteStreams.toByteArray(bodyStream);
                } finally {
                    // Close the stream even when reading it fails.
                    bodyStream.close();
                }
            }
        }
    } finally {
        // Release the response data on every path, not only on success.
        webResponse.cleanUp();
    }
    return responseContent;
}
? (2)配置JS,CSS,超時,重定向 ? private void configWebClient(WebClient webClient) {
    // Apply the demo's standard options to the given client. The individual setters are
    // independent of one another.
    // NOTE(review): `timeout` is not declared in this snippet; the full listing declares it as a field.

    // Timeout passed to the client options.
    webClient.getOptions().setTimeout(timeout);
    // Do not throw when a script on the page fails.
    webClient.getOptions().setThrowExceptionOnScriptError(false);
    // Follow redirects.
    webClient.getOptions().setRedirectEnabled(true);
    // Disable CSS, which avoids the follow-up requests for stylesheets.
    webClient.getOptions().setCssEnabled(false);
    // Enable JavaScript.
    webClient.getOptions().setJavaScriptEnabled(true);
}
? (3)代理 ? private void setProxy(WebClient webClient,HttpProxy proxy) {
    // Point the client's proxy configuration at the given host and port.
    ProxyConfig proxyConfig = webClient.getOptions().getProxyConfig();
    proxyConfig.setProxyHost(proxy.getHost());
    proxyConfig.setProxyPort(proxy.getPort());
    // HttpProxy leaves user/password null unless they are set, so only register
    // credentials for proxies that actually carry a user name; a null user name must
    // not be handed to the credentials provider.
    if (proxy.getUser() != null) {
        DefaultCredentialsProvider credentialsProvider = (DefaultCredentialsProvider) webClient
                .getCredentialsProvider();
        credentialsProvider.addCredentials(proxy.getUser(), proxy.getPassword());
    }
}
輔助類: package x.http.core;
/**
 * Mutable value holder describing an HTTP proxy: host, port and optional credentials.
 *
 * <p>All properties except {@code proxy} start out unset ({@code null} / {@code 0}).
 *
 * @author shilei
 */
public class HttpProxy {

    // Defaults to "http"; presumably the proxy protocol/scheme -- confirm against callers.
    private String proxy = "http";

    // Proxy endpoint.
    private String host;
    private int port;

    // Optional credentials; null until set.
    private String user;
    private String password;

    /** @return the {@code proxy} value, {@code "http"} unless changed */
    public String getProxy() {
        return this.proxy;
    }

    /** @param proxy new {@code proxy} value */
    public void setProxy(final String proxy) {
        this.proxy = proxy;
    }

    /** @return the proxy host, or {@code null} if not set */
    public String getHost() {
        return this.host;
    }

    /** @param host proxy host */
    public void setHost(final String host) {
        this.host = host;
    }

    /** @return the proxy port, {@code 0} if not set */
    public int getPort() {
        return this.port;
    }

    /** @param port proxy port */
    public void setPort(final int port) {
        this.port = port;
    }

    /** @return the user name, or {@code null} if not set */
    public String getUser() {
        return this.user;
    }

    /** @param user user name */
    public void setUser(final String user) {
        this.user = user;
    }

    /** @return the password, or {@code null} if not set */
    public String getPassword() {
        return this.password;
    }

    /** @param password password */
    public void setPassword(final String password) {
        this.password = password;
    }
}
? (4)Cookies:可以用於認證數據設置 1)設置Cookies ? private void setCookies(WebClient webClient,String domain, Map<String, String> cookies) {
if (cookies != null && cookies.size() > 0) {
webClient.getCookieManager().setCookiesEnabled(true);// enable
// cookies
for (Entry<String, String> c : cookies.entrySet()) {
Cookie cookie = new Cookie(domain, c.getKey(), c.getValue());
webClient.getCookieManager().addCookie(cookie);
}
}
}
? 2)獲取響應Cookies ? private Map<String, String> getResponseCookies(WebClient webClient) {
    // Snapshot the client's cookies first, then flatten them into a name -> value map.
    // When two cookies share a name, the one iterated later overwrites the earlier one.
    Set<Cookie> held = webClient.getCookieManager().getCookies();
    Map<String, String> byName = Maps.newHashMap();
    for (Cookie cookie : held) {
        String name = cookie.getName();
        byName.put(name, cookie.getValue());
    }
    return byName;
}
3)刪除所有Cookies /**
* 清除所有cookie
*/
public void clearCookies(final WebClient webClient) {
    // Ask the client's cookie manager to drop every cookie it currently holds.
    webClient
            .getCookieManager()
            .clearCookies();
}
? ?(5)驅動JS: 可實現自動化流程,如驅動表單提交,獲取表單提交后的頁面 如登錄后頁面: public void doWeb(Page page) {
    // Only HTML pages can run the login script; anything else is ignored.
    if (!(page instanceof HtmlPage)) {
        return;
    }
    // The credentials are spliced into single-quoted JavaScript string literals, so they
    // must be escaped; otherwise a quote or backslash in either value breaks the script.
    String user = escapeForJsString(String.valueOf(WeiboAccount.USERNAME));
    String pass = escapeForJsString(String.valueOf(WeiboAccount.PASSWORD));

    StringBuilder js = new StringBuilder();
    // Fill the second element named "username" and the second named "password".
    js.append("document.getElementsByName('username')[1].value='").append(user).append("';");
    js.append("document.getElementsByName('password')[1].value='").append(pass).append("';");
    // Click the second element with class "W_btn_g".
    js.append("document.getElementsByClassName('W_btn_g')[1].click();");

    HtmlPage htmlPage = (HtmlPage) page;
    htmlPage.executeJavaScript(js.toString());
}

// Escapes backslashes, single quotes and line breaks so the text can sit inside a
// single-quoted JavaScript string literal.
private static String escapeForJsString(String raw) {
    StringBuilder out = new StringBuilder(raw.length() + 8);
    for (int i = 0; i < raw.length(); i++) {
        char ch = raw.charAt(i);
        switch (ch) {
            case '\\':
                out.append("\\\\");
                break;
            case '\'':
                out.append("\\'");
                break;
            case '\n':
                out.append("\\n");
                break;
            case '\r':
                out.append("\\r");
                break;
            default:
                out.append(ch);
        }
    }
    return out.toString();
}
附錄:完整代碼 package x.http.simple.htmlunit;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import x.http.core.HttpProxy;
import com.gargoylesoftware.htmlunit.DefaultCredentialsProvider;
import com.gargoylesoftware.htmlunit.HttpMethod;
import com.gargoylesoftware.htmlunit.Page;
import com.gargoylesoftware.htmlunit.ProxyConfig;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.WebRequest;
import com.gargoylesoftware.htmlunit.WebResponse;
import com.gargoylesoftware.htmlunit.html.HtmlForm;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import com.gargoylesoftware.htmlunit.html.HtmlSubmitInput;
import com.gargoylesoftware.htmlunit.html.HtmlTextInput;
import com.gargoylesoftware.htmlunit.util.Cookie;
import com.gargoylesoftware.htmlunit.util.NameValuePair;
import com.google.common.collect.Maps;
import com.google.common.io.ByteStreams;
/**
 * Demonstration wrapper around a single HtmlUnit {@link WebClient}.
 *
 * <p>An instance owns one client, configured in the constructor with JavaScript enabled,
 * CSS disabled, redirects enabled and script errors not thrown. Call {@link #shutdown()}
 * when finished with the instance.
 */
public class HtmlUnitDemo {

    /** Charset used to encode rendered HTML pages; {@link #main(String[])} decodes with the same name. */
    private static final String CHARSET = "utf-8";

    /**
     * Milliseconds to wait for background JavaScript after loading an HTML page.
     * NOTE(review): the standalone snippets in this article wait 10000 ms; confirm that
     * 100000 ms is intended here.
     */
    private static final long JS_WAIT_MILLIS = 100000;

    private final WebClient webClient;
    private final int timeout = 50000;

    /** Creates a demo client that does not use a proxy. */
    public HtmlUnitDemo() {
        this(null);
    }

    /**
     * Creates a demo client, optionally routed through a proxy.
     *
     * @param proxy proxy settings, or {@code null} for a direct connection
     */
    public HtmlUnitDemo(HttpProxy proxy) {
        webClient = new WebClient();
        configWebClient();
        if (proxy != null) {
            setProxy(proxy);
        }
    }

    /**
     * Sends an HTTP GET request.
     *
     * @param url absolute URL to fetch
     * @return the response body, or {@code null} when the status code is not 200
     * @throws Exception if the URL is malformed or the request fails
     */
    public byte[] sendGetRequest(String url) throws Exception {
        WebRequest webRequest = new WebRequest(new URL(url));
        webRequest.setHttpMethod(HttpMethod.GET);
        return sendRequest(webRequest);
    }

    /**
     * Sends an HTTP POST request.
     *
     * @param url    absolute URL to post to
     * @param params form parameters; may be {@code null} or empty, in which case none are set
     * @return the response body, or {@code null} when the status code is not 200
     * @throws Exception if the URL is malformed or the request fails
     */
    public byte[] sendPostRequest(String url, Map<String, String> params) throws Exception {
        WebRequest webRequest = new WebRequest(new URL(url));
        webRequest.setHttpMethod(HttpMethod.POST);
        if (params != null && !params.isEmpty()) {
            // Build a fresh list and install it with setRequestParameters: the list handed
            // out by getRequestParameters() is not guaranteed to be mutable, so calling
            // add() on it directly can fail with UnsupportedOperationException.
            List<NameValuePair> pairs = new ArrayList<NameValuePair>(webRequest.getRequestParameters());
            for (Entry<String, String> param : params.entrySet()) {
                pairs.add(new NameValuePair(param.getKey(), param.getValue()));
            }
            webRequest.setRequestParameters(pairs);
        }
        return sendRequest(webRequest);
    }

    /**
     * Low-level request: executes {@code webRequest} and returns the body.
     *
     * <p>For an HTML page the method waits for background JavaScript and returns the page's
     * {@code asXml()} text encoded with {@link #CHARSET}; for any other page type it returns
     * the raw response stream contents.
     *
     * @param webRequest request to execute
     * @return the response body, or {@code null} when the status code is not 200
     * @throws Exception if the request or reading the response fails
     */
    private byte[] sendRequest(WebRequest webRequest) throws Exception {
        byte[] responseContent = null;
        Page page = webClient.getPage(webRequest);
        WebResponse webResponse = page.getWebResponse();
        try {
            int status = webResponse.getStatusCode();
            System.out.println("Charset : " + webResponse.getContentCharset());
            System.out.println("ContentType : " + webResponse.getContentType());
            // Only a 200 response has its content read.
            if (status == 200) {
                if (page.isHtmlPage()) {
                    // Let background JavaScript finish before serialising the DOM.
                    webClient.waitForBackgroundJavaScript(JS_WAIT_MILLIS);
                    // Encode explicitly: the no-arg getBytes() depends on the platform
                    // default charset, while main() decodes as utf-8.
                    responseContent = ((HtmlPage) page).asXml().getBytes(CHARSET);
                } else {
                    InputStream bodyStream = webResponse.getContentAsStream();
                    try {
                        responseContent = ByteStreams.toByteArray(bodyStream);
                    } finally {
                        // Close the stream even when reading it fails.
                        bodyStream.close();
                    }
                }
            }
        } finally {
            // Release the response data on every path, not only on success.
            webResponse.cleanUp();
        }
        return responseContent;
    }

    /** Applies the demo's standard options to the owned client. */
    private void configWebClient() {
        // 1. Enable JavaScript.
        webClient.getOptions().setJavaScriptEnabled(true);
        // 2. Disable CSS, which avoids the follow-up requests for stylesheets.
        webClient.getOptions().setCssEnabled(false);
        // 3. Follow redirects.
        webClient.getOptions().setRedirectEnabled(true);
        // 4. Do not throw when a script on the page fails.
        webClient.getOptions().setThrowExceptionOnScriptError(false);
        // 5. Timeout passed to the client options.
        webClient.getOptions().setTimeout(timeout);
    }

    /**
     * Routes the owned client through the given proxy.
     *
     * @param proxy proxy settings; must not be {@code null}
     */
    private void setProxy(HttpProxy proxy) {
        ProxyConfig proxyConfig = webClient.getOptions().getProxyConfig();
        proxyConfig.setProxyHost(proxy.getHost());
        proxyConfig.setProxyPort(proxy.getPort());
        // HttpProxy leaves user/password null unless they are set, so only register
        // credentials for proxies that actually carry a user name.
        if (proxy.getUser() != null) {
            DefaultCredentialsProvider credentialsProvider = (DefaultCredentialsProvider) webClient
                    .getCredentialsProvider();
            credentialsProvider.addCredentials(proxy.getUser(), proxy.getPassword());
        }
    }

    /**
     * Collects the cookies currently held by the client into a name-to-value map.
     *
     * @return a new map; when two cookies share a name, the one iterated later wins
     */
    @SuppressWarnings("unused")
    private Map<String, String> getResponseCookies() {
        Set<Cookie> cookies = webClient.getCookieManager().getCookies();
        Map<String, String> responseCookies = Maps.newHashMap();
        for (Cookie c : cookies) {
            responseCookies.put(c.getName(), c.getValue());
        }
        return responseCookies;
    }

    /**
     * Enables cookies and registers one cookie per map entry under {@code domain}.
     * Does nothing when {@code cookies} is {@code null} or empty.
     *
     * @param domain  domain the cookies belong to
     * @param cookies cookie names mapped to their values
     */
    @SuppressWarnings("unused")
    private void setCookies(String domain, Map<String, String> cookies) {
        if (cookies != null && !cookies.isEmpty()) {
            webClient.getCookieManager().setCookiesEnabled(true);
            for (Entry<String, String> c : cookies.entrySet()) {
                Cookie cookie = new Cookie(domain, c.getKey(), c.getValue());
                webClient.getCookieManager().addCookie(cookie);
                System.out.println("Set Cookies : " + c.getKey() + " | " + c.getValue());
            }
        }
    }

    /** Removes every cookie held by the client. */
    public void clearCookies() {
        webClient.getCookieManager().clearCookies();
    }

    /**
     * Closes all windows of the owned client.
     *
     * @throws IOException declared for callers; not thrown by the code visible here
     */
    public void shutdown() throws IOException {
        webClient.closeAllWindows();
    }

    /**
     * Opens Google, types "baidu" into the search form, submits it and prints the result page.
     * Uses its own throw-away client with default options, not the client owned by this instance.
     *
     * @throws Exception if loading a page or locating a form element fails
     */
    public void demo() throws Exception {
        String url = "http://www.google.com.hk";
        // Named differently from the field to avoid shadowing it.
        final WebClient demoClient = new WebClient();
        try {
            HtmlPage htmlPage = demoClient.getPage(url);
            // HtmlUnit DOM model: look up the form whose name attribute is "f".
            HtmlForm form = htmlPage.getFormByName("f");
            // Look up the text input whose name attribute is "q".
            HtmlTextInput text = form.getInputByName("q");
            // Type the search term.
            text.setText("baidu");
            // Look up the submit button whose name attribute is "btnG".
            HtmlSubmitInput button = form.getInputByName("btnG");
            // Clicking the button submits the form and yields the result page.
            HtmlPage listPage = button.click();
            System.out.println(listPage.asXml());
        } finally {
            // Close the throw-away client even when a step above throws.
            demoClient.closeAllWindows();
        }
    }

    /**
     * Issues one GET and one POST against Google and prints both response bodies.
     *
     * @param args unused
     * @throws Exception if a request fails
     */
    public static void main(String[] args) throws Exception {
        String url = "http://www.google.com.hk";
        HtmlUnitDemo htmlUnit = new HtmlUnitDemo();
        try {
            printBody("Get Body : ", htmlUnit.sendGetRequest(url));
            // Previously mislabelled as "Get Body".
            printBody("Post Body : ", htmlUnit.sendPostRequest(url, null));
        } finally {
            // Shut the client down even when one of the requests throws.
            htmlUnit.shutdown();
        }
    }

    // Prints a labelled body, tolerating the null that the send methods return for non-200 responses.
    private static void printBody(String label, byte[] body) throws Exception {
        if (body == null) {
            System.out.println(label + "(no body: status was not 200)");
        } else {
            System.out.println(label + new String(body, CHARSET));
        }
    }
}
|
