編寫不易,轉載請注明(http://shihlei.iteye.com/blog/2067707)! 一 概述 ? ? ? HttpClient適合處理靜態資源,網絡爬蟲等類似應用很大程度需要處理動態網頁(內容有js填充,如百度圖片,body里基本沒有數據,碰到最麻煩的是新浪微博列表頁)。將網頁下載后,結合JS和Dom模型還原網頁,我目前還未攻破,但在下載層還原網頁,HtmlUnit是一種解決方案,雖然對JS的支持還是不完美。 ? ? HtmlUnit其實是自動化測試工具,集成了下載(HttpClient),Dom(NekoHtml),驅動JS(Rhino)。有一定的網頁渲染能力,由於會驅動Dom,會消耗些CPU,內存。 ? ?本文描述HTMLUnit請求響應,設置cookies,設置代理,驅動JS等方法。 ? 二 版本 ? <dependency> <groupId>net.sourceforge.htmlunit</groupId> <artifactId>htmlunit</artifactId> <version>2.14</version> </dependency> ? 三 典型功能 ? 1) 打開google搜索百度 ? /** * 打開google 搜索百度 * * @param args * @throws Exception */ public static void main(String[] args) throws Exception { String url = "http://www.google.com.hk"; final WebClient webClient = new WebClient(); HtmlPage htmlPage = webClient.getPage(url); // HtmlUnit dom模型 // 獲取表單 ,獲得form標簽name屬性=f HtmlForm form = htmlPage.getFormByName("f"); // 獲取輸入框, 獲取 input標簽 ,name屬性=q HtmlTextInput text = form.getInputByName("q"); // 搜索百度 text.setText("baidu"); // 獲取提交按鈕 HtmlSubmitInput button = form.getInputByName("btnG"); // 提交表單 HtmlPage listPage = button.click(); System.out.println(listPage.asXml()); webClient.closeAllWindows(); } ? 2)獲取動態頁面 ? 
/** * 獲取百度圖片js后的內容 * * @throws Exception */ public void demo2() throws Exception { String url = "http://image.baidu.com/i?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=result&fr=&sf=1&fmq=1400328281672_R&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&word=html"; final WebClient webClient = new WebClient(); // 1 啟動JS webClient.getOptions().setJavaScriptEnabled(true); // 2 禁用Css,可避免自動二次請求CSS進行渲染 webClient.getOptions().setCssEnabled(false); // 3 啟動客戶端重定向 webClient.getOptions().setRedirectEnabled(true); // 4 js運行錯誤時,是否拋出異常 webClient.getOptions().setThrowExceptionOnScriptError(false); // 5 設置超時 webClient.getOptions().setTimeout(50000); HtmlPage htmlPage = webClient.getPage(url); // 等待JS驅動dom完成獲得還原后的網頁 webClient.waitForBackgroundJavaScript(10000); // 網頁內容 System.out.println(htmlPage.asXml()); webClient.closeAllWindows(); } ? 四 樣例 (1)請求響應 ? /** * Get請求 * @param url * @return * @throws Exception */ public static byte[] sendGetRequest(String url) throws Exception{ WebClient webClient = new WebClient(); WebRequest webRequest = new WebRequest(new URL(url)); webRequest.setHttpMethod(HttpMethod.GET); return sendRequest(webClient,webRequest); } /** * Post 請求 * * @param url * @param params * @return * @throws Exception */ public static byte[] sendPostRequest(String url,Map<String,String> params) throws Exception{ WebClient webClient = new WebClient(); WebRequest webRequest = new WebRequest(new URL(url)); webRequest.setHttpMethod(HttpMethod.POST); if (params != null && params.size() > 0) { for (Entry<String, String> param : params.entrySet()) { webRequest.getRequestParameters().add(new NameValuePair(param.getKey(), param.getValue())); } } return sendRequest(webClient,webRequest); } //底層請求 private static byte[] sendRequest(WebClient webClient,WebRequest webRequest) throws Exception{ byte[] responseContent = null; Page page = webClient.getPage(webRequest); WebResponse webResponse = page.getWebResponse(); int status = webResponse.getStatusCode(); 
System.out.println("Charset : " + webResponse.getContentCharset()); System.out.println("ContentType : " + webResponse.getContentType()); // 讀取數據內容 if (status==200) { if (page.isHtmlPage()) { // 等待JS執行完成,包括遠程JS文件請求,Dom處理 webClient.waitForBackgroundJavaScript(10000); // 使用JS還原網頁 responseContent = ((HtmlPage) page).asXml().getBytes(); } else { InputStream bodyStream = webResponse.getContentAsStream(); responseContent = ByteStreams.toByteArray(bodyStream); bodyStream.close(); } } // 關閉響應流 webResponse.cleanUp(); return responseContent; } ? (2)配置JS,CSS,超時,重定向 ? private void configWebClient(WebClient webClient) { // 設置webClient的相關參數 // 1 啟動JS webClient.getOptions().setJavaScriptEnabled(true); // 2 禁用Css,可避免自動二次請求CSS進行渲染 webClient.getOptions().setCssEnabled(false); // 3 啟動客戶端重定向 webClient.getOptions().setRedirectEnabled(true); // 4 js運行錯誤時,是否拋出異常 webClient.getOptions().setThrowExceptionOnScriptError(false); // 5 設置超時 webClient.getOptions().setTimeout(timeout); } ? (3)代理 ? private void setProxy(WebClient webClient,HttpProxy proxy) { ProxyConfig proxyConfig = webClient.getOptions().getProxyConfig(); proxyConfig.setProxyHost(proxy.getHost()); proxyConfig.setProxyPort(proxy.getPort()); DefaultCredentialsProvider credentialsProvider = (DefaultCredentialsProvider) webClient .getCredentialsProvider(); credentialsProvider.addCredentials(proxy.getUser(), proxy.getPassword()); } ? ?輔助類: ? 
package x.http.core; /** * Http代理 * * @author shilei * */ public class HttpProxy { private String proxy = "http"; private String host; private int port; private String user; private String password; public String getProxy() { return proxy; } public void setProxy(String proxy) { this.proxy = proxy; } public String getHost() { return host; } public void setHost(String host) { this.host = host; } public int getPort() { return port; } public void setPort(int port) { this.port = port; } public String getUser() { return user; } public void setUser(String user) { this.user = user; } public String getPassword() { return password; } public void setPassword(String password) { this.password = password; } } ? (4)Cookies:可以用於認證數據設置 1)設置Cookies ? private void setCookies(WebClient webClient,String domain, Map<String, String> cookies) { if (cookies != null && cookies.size() > 0) { webClient.getCookieManager().setCookiesEnabled(true);// enable // cookies for (Entry<String, String> c : cookies.entrySet()) { Cookie cookie = new Cookie(domain, c.getKey(), c.getValue()); webClient.getCookieManager().addCookie(cookie); } } } ? 2)獲取響應Cookies ? private Map<String, String> getResponseCookies(WebClient webClient) { Set<Cookie> cookies = webClient.getCookieManager().getCookies(); Map<String, String> responseCookies = Maps.newHashMap(); for (Cookie c : cookies) { responseCookies.put(c.getName(), c.getValue()); } return responseCookies; } ? 3)刪除所有Cookies ? /** * 清除所有cookie */ public void clearCookies(WebClient webClient) { webClient.getCookieManager().clearCookies(); } ? 
// (5) Driving JS: enables automated flows, such as driving a form submission and obtaining
// the page that follows it - for example the page shown after logging in:

/**
 * Fills in the login form of the given page through JavaScript and clicks the login button.
 * Does nothing when the page is not an {@link HtmlPage}.
 *
 * @param page the page to drive
 */
public void doWeb(Page page) {
    if (!(page instanceof HtmlPage)) {
        return;
    }
    // The credentials are spliced into JS string literals, so they must be escaped;
    // otherwise a quote or backslash in either value breaks (or rewrites) the script.
    StringBuilder js = new StringBuilder();
    js.append("document.getElementsByName('username')[1].value='")
            .append(escapeJsString(WeiboAccount.USERNAME))
            .append("';");
    js.append("document.getElementsByName('password')[1].value='")
            .append(escapeJsString(WeiboAccount.PASSWORD))
            .append("';");
    js.append("document.getElementsByClassName('W_btn_g')[1].click();");

    HtmlPage htmlPage = (HtmlPage) page;
    htmlPage.executeJavaScript(js.toString());
}

/** Escapes a value for use inside a single-quoted JavaScript string literal. */
private static String escapeJsString(String value) {
    // Mirror StringBuilder.append(String) for null so a missing value still yields "null".
    String raw = String.valueOf(value);
    StringBuilder escaped = new StringBuilder(raw.length() + 8);
    for (int i = 0; i < raw.length(); i++) {
        char c = raw.charAt(i);
        switch (c) {
            case '\\':
                escaped.append("\\\\");
                break;
            case '\'':
                escaped.append("\\'");
                break;
            case '\n':
                escaped.append("\\n");
                break;
            case '\r':
                escaped.append("\\r");
                break;
            default:
                escaped.append(c);
                break;
        }
    }
    return escaped.toString();
}

// Appendix: complete code

package x.http.simple.htmlunit;

import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;

import x.http.core.HttpProxy;

import com.gargoylesoftware.htmlunit.DefaultCredentialsProvider;
import com.gargoylesoftware.htmlunit.HttpMethod;
import com.gargoylesoftware.htmlunit.Page;
import com.gargoylesoftware.htmlunit.ProxyConfig;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.WebRequest;
import com.gargoylesoftware.htmlunit.WebResponse;
import com.gargoylesoftware.htmlunit.html.HtmlForm;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import com.gargoylesoftware.htmlunit.html.HtmlSubmitInput;
import com.gargoylesoftware.htmlunit.html.HtmlTextInput;
import com.gargoylesoftware.htmlunit.util.Cookie;
import com.gargoylesoftware.htmlunit.util.NameValuePair;
import com.google.common.collect.Maps;
import com.google.common.io.ByteStreams;

/**
 * Small HtmlUnit wrapper demonstrating GET/POST requests with JavaScript processing,
 * proxy configuration and cookie handling on one shared {@link WebClient}.
 *
 * <p>Call {@link #shutdown()} when the instance is no longer needed.
 */
public class HtmlUnitDemo {

    /** Charset used to encode restored HTML pages and to decode bodies in {@link #main}. */
    private static final String BODY_CHARSET = "utf-8";

    /**
     * Upper bound, in milliseconds, passed to waitForBackgroundJavaScript. Aligned with the
     * 10000 ms used by the other examples in this article (the listing previously used
     * 100000 ms here).
     */
    private static final int JS_WAIT_MILLIS = 10000;

    private final WebClient webClient;
    private final int timeout = 50000;

    /** Creates a demo client without a proxy. */
    public HtmlUnitDemo() {
        this(null);
    }

    /**
     * Creates a demo client, optionally routed through a proxy.
     *
     * @param proxy proxy settings, or {@code null} for a direct connection
     */
    public HtmlUnitDemo(HttpProxy proxy) {
        webClient = new WebClient();
        configWebClient();
        // Configure the proxy.
        if (proxy != null) {
            setProxy(proxy);
        }
    }

    /**
     * Sends a GET request.
     *
     * @param url the URL to request
     * @return the response body, or {@code null} when the status code is not 200
     * @throws Exception if the URL is malformed or the request fails
     */
    public byte[] sendGetRequest(String url) throws Exception {
        WebRequest webRequest = new WebRequest(new URL(url));
        webRequest.setHttpMethod(HttpMethod.GET);
        return sendRequest(webRequest);
    }

    /**
     * Sends a POST request.
     *
     * @param url    the URL to request
     * @param params form parameters to send; may be {@code null} or empty
     * @return the response body, or {@code null} when the status code is not 200
     * @throws Exception if the URL is malformed or the request fails
     */
    public byte[] sendPostRequest(String url, Map<String, String> params) throws Exception {
        WebRequest webRequest = new WebRequest(new URL(url));
        webRequest.setHttpMethod(HttpMethod.POST);
        if (params != null && !params.isEmpty()) {
            // Build a fresh list and install it with setRequestParameters(): the list returned
            // by getRequestParameters() is not guaranteed to be mutable (HtmlUnit appears to
            // initialise it to an immutable empty list - confirm for your version).
            List<NameValuePair> pairs = new ArrayList<NameValuePair>(params.size());
            for (Entry<String, String> param : params.entrySet()) {
                pairs.add(new NameValuePair(param.getKey(), param.getValue()));
            }
            webRequest.setRequestParameters(pairs);
        }
        return sendRequest(webRequest);
    }

    /**
     * Low-level request: executes the request on the shared client and reads the body.
     *
     * @param webRequest the request to execute
     * @return for an HTML page, the page after JS processing serialized as XML and encoded
     *         as UTF-8; for any other content, the raw body bytes; {@code null} when the
     *         status code is not 200
     * @throws Exception if the request fails or the body cannot be read
     */
    private byte[] sendRequest(WebRequest webRequest) throws Exception {
        Page page = webClient.getPage(webRequest);
        WebResponse webResponse = page.getWebResponse();
        try {
            System.out.println("Charset : " + webResponse.getContentCharset());
            System.out.println("ContentType : " + webResponse.getContentType());

            // Only read the content of a 200 response.
            if (webResponse.getStatusCode() != 200) {
                return null;
            }
            if (page.isHtmlPage()) {
                // Wait for JS to complete.
                webClient.waitForBackgroundJavaScript(JS_WAIT_MILLIS);
                // Encode explicitly instead of relying on the platform default charset;
                // main() decodes with the same charset.
                return ((HtmlPage) page).asXml().getBytes(BODY_CHARSET);
            }
            InputStream bodyStream = webResponse.getContentAsStream();
            try {
                return ByteStreams.toByteArray(bodyStream);
            } finally {
                bodyStream.close();
            }
        } finally {
            // Close the response stream, also when reading fails.
            webResponse.cleanUp();
        }
    }

    /** Applies the crawler-oriented options to the shared client. */
    private void configWebClient() {
        // 1. Enable JavaScript.
        webClient.getOptions().setJavaScriptEnabled(true);
        // 2. Disable CSS, which avoids the automatic follow-up requests for CSS used in rendering.
        webClient.getOptions().setCssEnabled(false);
        // 3. Enable client-side redirects.
        webClient.getOptions().setRedirectEnabled(true);
        // 4. Do not throw when a script fails at runtime.
        webClient.getOptions().setThrowExceptionOnScriptError(false);
        // 5. Set the timeout.
        webClient.getOptions().setTimeout(timeout);
    }

    /**
     * Routes the shared client through the given proxy and registers its credentials, if any.
     *
     * @param proxy proxy host, port and optional user/password
     */
    private void setProxy(HttpProxy proxy) {
        ProxyConfig proxyConfig = webClient.getOptions().getProxyConfig();
        proxyConfig.setProxyHost(proxy.getHost());
        proxyConfig.setProxyPort(proxy.getPort());

        // An unauthenticated proxy has no user; registering null credentials would be rejected.
        if (proxy.getUser() == null) {
            return;
        }
        DefaultCredentialsProvider credentialsProvider =
                (DefaultCredentialsProvider) webClient.getCredentialsProvider();
        // Scope the credentials to the proxy itself so they are not offered to other hosts.
        credentialsProvider.addCredentials(
                proxy.getUser(), proxy.getPassword(), proxy.getHost(), proxy.getPort(), null);
    }

    /**
     * Copies the cookies currently held by the cookie manager into a name-to-value map.
     *
     * @return a new map of cookie names to cookie values
     */
    @SuppressWarnings("unused")
    private Map<String, String> getResponseCookies() {
        Set<Cookie> cookies = webClient.getCookieManager().getCookies();
        Map<String, String> responseCookies = Maps.newHashMap();
        for (Cookie c : cookies) {
            responseCookies.put(c.getName(), c.getValue());
        }
        return responseCookies;
    }

    /**
     * Enables cookies and adds one cookie per map entry for the given domain.
     * Does nothing when the map is null or empty.
     *
     * @param domain  the domain the cookies are created for
     * @param cookies cookie names mapped to cookie values
     */
    @SuppressWarnings("unused")
    private void setCookies(String domain, Map<String, String> cookies) {
        if (cookies == null || cookies.isEmpty()) {
            return;
        }
        // Enable cookies before adding any.
        webClient.getCookieManager().setCookiesEnabled(true);
        for (Entry<String, String> c : cookies.entrySet()) {
            Cookie cookie = new Cookie(domain, c.getKey(), c.getValue());
            webClient.getCookieManager().addCookie(cookie);
            System.out.println("Set Cookies : " + c.getKey() + " | " + c.getValue());
        }
    }

    /**
     * Clears all cookies.
     */
    public void clearCookies() {
        webClient.getCookieManager().clearCookies();
    }

    /**
     * Closes all windows of the shared client.
     *
     * @throws IOException declared for callers; kept from the original signature
     */
    public void shutdown() throws IOException {
        webClient.closeAllWindows();
    }

    /**
     * Opens Google, searches for "baidu" through the page's form and prints the result page
     * as XML. Uses its own short-lived client rather than the shared one.
     *
     * @throws Exception if the page cannot be loaded or an expected form element is missing
     */
    public void demo() throws Exception {
        String url = "http://www.google.com.hk";
        final WebClient webClient = new WebClient();
        try {
            HtmlPage htmlPage = webClient.getPage(url);

            // HtmlUnit DOM model.
            // Look up the form whose name attribute is "f".
            HtmlForm form = htmlPage.getFormByName("f");
            // Look up the input whose name attribute is "q" (the search box).
            HtmlTextInput text = form.getInputByName("q");
            // Search for Baidu.
            text.setText("baidu");
            // Look up the submit button.
            HtmlSubmitInput button = form.getInputByName("btnG");
            // Submit the form; clicking returns the resulting page.
            HtmlPage listPage = button.click();

            System.out.println(listPage.asXml());
        } finally {
            // Release the local client's windows even when a lookup or request above fails.
            webClient.closeAllWindows();
        }
    }

    /**
     * Sends a GET and a POST request to Google and prints both bodies.
     *
     * @param args unused
     * @throws Exception if a request fails
     */
    public static void main(String[] args) throws Exception {
        String url = "http://www.google.com.hk";
        HtmlUnitDemo htmlUnit = new HtmlUnitDemo();
        try {
            byte[] getResponse = htmlUnit.sendGetRequest(url);
            System.out.println("Get Body : " + decodeBody(getResponse));

            byte[] postResponse = htmlUnit.sendPostRequest(url, null);
            System.out.println("Post Body : " + decodeBody(postResponse));
        } finally {
            htmlUnit.shutdown();
        }
    }

    /** Decodes a response body, tolerating the null returned for non-200 responses. */
    private static String decodeBody(byte[] body) throws IOException {
        return body == null ? null : new String(body, BODY_CHARSET);
    }
}