新版HttpClient4.2與之前的3.x版本相比有了很大變化,建議從 http://hc.apache.org/ 獲取最新的信息。
- 關於HttpCore與HttpClient:HttpCore是位於HTTP傳輸組件的底層包,可以用來簡化HTTP客戶端與服務器端的開發。HttpClient是一個符合HTTP1.1版本,基於HttpCore類包的一個實現。它同時為客戶端認證、HTTP狀態管理、HTTP連接管理提供了可重用的客戶端組件。HttpCore類包目前最新發布版本是httpcore-4.2.4;HttpClient類包的版本是httpclient-4.2.5。
了解了HttpCore包與HttpClient包的差別之後,在程序中就應該大致知道各個包分別存在於哪個類庫中。比如:org.apache.http包屬於HttpCore,而org.apache.http.client包屬於HttpClient。 - HttpClient的API文檔在下載的zip中已經包括;
HttpCore的API文檔可以參考:http://hc.apache.org/httpcomponents-core-4.2.x/httpcore/apidocs/index.html - HttpClient4.2需要Java 5.0及以上版本;需要支持包有(下載zip包中已經包括):
* Apache HttpComponents HttpCore
* Apache Commons Logging
* Apache Commons Codec
1. 獲取一個HTML頁面的內容,一個簡單的get應用
// 獲取一個HTML頁面的內容,一個簡單的get應用 public void grabPageHTML() throws Exception { HttpClient httpclient = new DefaultHttpClient(); HttpGet httpget = new HttpGet("http://www.baidu.com/"); HttpResponse response = httpclient.execute(httpget); HttpEntity entity = response.getEntity(); String html = EntityUtils.toString(entity, "GBK"); // releaseConnection等同於reset,作用是重置request狀態位,為下次使用做好准備。 // 其實就是用一個HttpGet獲取多個頁面的情況下有效果;否則可以忽略此方法。 httpget.releaseConnection(); System.out.println(html); }
2. 下載一個文件到本地(本示范中為一個驗證碼圖片)
// 下載一個文件到本地(本示范中為一個驗證碼圖片) public void downloadFile() throws Exception { String url = "http://www.lashou.com/account/captcha"; String destfilename = "D:\\TDDOWNLOAD\\yz.png"; HttpClient httpclient = new DefaultHttpClient(); HttpGet httpget = new HttpGet(url); File file = new File(destfilename); if (file.exists()) { file.delete(); } HttpResponse response = httpclient.execute(httpget); HttpEntity entity = response.getEntity(); InputStream in = entity.getContent(); try { FileOutputStream fout = new FileOutputStream(file); int l = -1; byte[] tmp = new byte[2048]; while ((l = in.read(tmp)) != -1) { fout.write(tmp); } fout.close(); } finally { // 在用InputStream處理HttpEntity時,切記要關閉低層流。 in.close(); } httpget.releaseConnection(); }
3. Post方法,模擬表單提交參數登錄到網站並打開會員頁面獲取內容(會話保持)
// Post方法,模擬表單提交參數登錄到網站。 // 結合了上面兩個方法:grabPageHTML/downloadFile,同時增加了Post的代碼。 public void login2Lashou() throws Exception { // 第一步:先下載驗證碼到本地 String url = "http://www.lashou.com/account/captcha"; String destfilename = "D:\\TDDOWNLOAD\\yz.png"; HttpClient httpclient = new DefaultHttpClient(); HttpGet httpget = new HttpGet(url); File file = new File(destfilename); if (file.exists()) { file.delete(); } HttpResponse response = httpclient.execute(httpget); HttpEntity entity = response.getEntity(); InputStream in = entity.getContent(); try { FileOutputStream fout = new FileOutputStream(file); int l = -1; byte[] tmp = new byte[2048]; while ((l = in.read(tmp)) != -1) { fout.write(tmp); } fout.close(); } finally { in.close(); } httpget.releaseConnection();
// 第二步:用Post方法帶若干參數嘗試登錄,需要手工輸入下載驗證碼中顯示的字母、數字 BufferedReader br = new BufferedReader(new InputStreamReader(System.in)); System.out.println("請輸入下載下來的驗證碼中顯示的數字..."); String yan = br.readLine(); HttpPost httppost = new HttpPost("http://www.lashou.com/account/login/"); List<NameValuePair> params = new ArrayList<NameValuePair>(); params.add(new BasicNameValuePair("user_id", "testuser007")); params.add(new BasicNameValuePair("pwd", "asdfg123")); params.add(new BasicNameValuePair("yan", yan)); params.add(new BasicNameValuePair("save_user", "on")); params.add(new BasicNameValuePair("save_pwd", "on")); params.add(new BasicNameValuePair("sub", "登錄")); httppost.setEntity(new UrlEncodedFormEntity(params)); response = httpclient.execute(httppost); entity = response.getEntity(); // 在這里可以用Jsoup之類的工具對返回結果進行分析,以判斷登錄是否成功 String postResult = EntityUtils.toString(entity, "GBK"); // 我們這里只是簡單的打印出當前Cookie值以判斷登錄是否成功。 List<Cookie> cookies = ((AbstractHttpClient)httpclient).getCookieStore().getCookies(); for(Cookie cookie: cookies) System.out.println(cookie); httppost.releaseConnection();
// 第三步:打開會員頁面以判斷登錄成功(未登錄用戶是打不開會員頁面的) String memberpage = "http://www.lashou.com/account/orders/"; httpget = new HttpGet(memberpage); response = httpclient.execute(httpget); // 必須是同一個HttpClient! entity = response.getEntity(); String html = EntityUtils.toString(entity, "GBK"); httpget.releaseConnection(); System.out.println(html); }
輸出:
請輸入下載下來的驗證碼中顯示的數字...
sbzq
...
[version: 0][name: login_name2][value: testuser007][domain: www.lashou.com][path: /][expiry: Mon Sep 09 10:21:19 CST 2013]
[version: 0][name: pwd2][value: 4c88a4062736c26572d3ec382868fa2b][domain: lashou.com][path: /][expiry: Mon Sep 09 10:21:19 CST 2013]
?<!doctype html>
...
4. 設置代理服務器
// 設置代理服務器 public void testProxy() throws Exception { HttpHost proxy = new HttpHost("127.0.0.1", 8888); // 方式一 HttpClient httpclient = new DefaultHttpClient(); httpclient.getParams().setParameter(ConnRoutePNames.DEFAULT_PROXY, proxy); // 方式二 HttpParams params = new BasicHttpParams(); params.setParameter(ConnRoutePNames.DEFAULT_PROXY, proxy); HttpClient httpclient1 = new DefaultHttpClient(params); }
5. 幾種常用HTTP頭的設置
// 幾種常用HTTP頭的設置 public void testBasicHeader() throws Exception { HttpParams params = new BasicHttpParams(); Collection<BasicHeader> collection = new ArrayList<BasicHeader>(); collection.add(new BasicHeader("Accept", "text/html, application/xhtml+xml, */*")); collection.add(new BasicHeader("Referer", "http://www.sina.com/")); collection.add(new BasicHeader("Accept-Language", "zh-CN")); collection.add(new BasicHeader("User-Agent", "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)")); collection.add(new BasicHeader("Accept-Encoding", "gzip, deflate")); params.setParameter(ClientPNames.DEFAULT_HEADERS, collection); HttpClient httpclient = new DefaultHttpClient(params); // 下面內容略 }
6. 多線程編程下的連接池設置
// 多線程編程下的線程池設置(這點在需要登錄且用一個HttpClient對象抓取多個頁面的情況下特別有用) public void testConnectionManager() throws Exception { // 連接池設置 SchemeRegistry schemeRegistry = new SchemeRegistry(); schemeRegistry.register(new Scheme("http", 80, PlainSocketFactory.getSocketFactory())); schemeRegistry.register(new Scheme("https", 443, SSLSocketFactory.getSocketFactory())); PoolingClientConnectionManager cm = new PoolingClientConnectionManager(schemeRegistry); cm.setMaxTotal(200); // 連接池里的最大連接數 cm.setDefaultMaxPerRoute(20); // 每個路由的默認最大連接數 HttpHost localhost = new HttpHost("locahost", 80); // 可以針對某特定網站指定最大連接數 cm.setMaxPerRoute(new HttpRoute(localhost), 30); // 其它設置 HttpParams params = new BasicHttpParams(); params.setParameter(ClientPNames.COOKIE_POLICY, CookiePolicy.BROWSER_COMPATIBILITY); HttpClient httpclient = new DefaultHttpClient(cm, params); // 下面內容略 }
7. 測試HTTP上下文對象(HttpContext)
// 測試HTTP上下文對象(HttpContext) public void testContext() throws Exception { // 請求一個頁面,然后解析各上下文對象 DefaultHttpClient httpclient = new DefaultHttpClient(); HttpContext localContext = new BasicHttpContext(); HttpGet httpget = new HttpGet("http://www.baidu.com/"); HttpResponse response = httpclient.execute(httpget, localContext); // the actual connection to the target server. HttpConnection conn = (HttpConnection) localContext.getAttribute( ExecutionContext.HTTP_CONNECTION); System.out.println("Socket timeout: " + conn.getSocketTimeout()); // the connection target HttpHost target = (HttpHost) localContext.getAttribute( ExecutionContext.HTTP_TARGET_HOST); System.out.println("Final target: " + target); // the connection proxy, if used HttpHost proxy = (HttpHost) localContext .getAttribute(ExecutionContext.HTTP_PROXY_HOST); if (proxy != null) System.out.println("Proxy host/port: " + proxy.getHostName() + "/" + proxy.getPort()); // the actual HTTP request HttpRequest request = (HttpRequest) localContext .getAttribute(ExecutionContext.HTTP_REQUEST); System.out.println("HTTP version: " + request.getProtocolVersion()); Header[] headers = request.getAllHeaders(); System.out.println("HTTP Headers: "); for (Header header : headers) { System.out.println("\t" + header.getName() + ": " + header.getValue()); } System.out.println("HTTP URI: " + request.getRequestLine().getUri()); // the actual HTTP response response = (HttpResponse) localContext .getAttribute(ExecutionContext.HTTP_RESPONSE); HttpEntity entity = response.getEntity(); if (entity != null) { System.out.println("Content Encoding:" + entity.getContentEncoding()); System.out.println("Content Type:" + entity.getContentType()); } // the flag indicating whether the actual request has been fully transmitted to the connection target. System.out.println("Sent flag: " + localContext.getAttribute(ExecutionContext.HTTP_REQ_SENT));
// 如果沒有用到返回entity中的內容,那么要把它消費掉,以保證底層的資源得以釋放。 entity = response.getEntity(); EntityUtils.consume(entity); }
輸出:
Socket timeout: 0
Final target: http://www.baidu.com
HTTP version: HTTP/1.1
HTTP Headers:
Host: www.baidu.com
Connection: Keep-Alive
User-Agent: Apache-HttpClient/4.2.5 (java 1.5)
HTTP URI: /
Content Encoding:null
Content Type:Content-Type: text/html;charset=utf-8
Sent flag: true
8. 完整的代碼

package com.clzhang.sample.net;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;

import org.apache.http.Header;
import org.apache.http.HttpConnection;
import org.apache.http.HttpEntity;
import org.apache.http.HttpHost;
import org.apache.http.HttpRequest;
import org.apache.http.HttpResponse;
import org.apache.http.NameValuePair;
import org.apache.http.client.HttpClient;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.params.ClientPNames;
import org.apache.http.client.params.CookiePolicy;
import org.apache.http.conn.params.ConnRoutePNames;
import org.apache.http.conn.routing.HttpRoute;
import org.apache.http.conn.scheme.PlainSocketFactory;
import org.apache.http.conn.scheme.Scheme;
import org.apache.http.conn.scheme.SchemeRegistry;
import org.apache.http.conn.ssl.SSLSocketFactory;
import org.apache.http.cookie.Cookie;
import org.apache.http.impl.client.AbstractHttpClient;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.impl.conn.PoolingClientConnectionManager;
import org.apache.http.message.BasicHeader;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.params.BasicHttpParams;
import org.apache.http.params.HttpParams;
import org.apache.http.protocol.BasicHttpContext;
import org.apache.http.protocol.ExecutionContext;
import org.apache.http.protocol.HttpContext;
import org.apache.http.util.EntityUtils;

/**
 * Sample usages of Apache HttpClient 4.2: simple GET, file download, form login with
 * session keeping, proxy and default-header configuration, connection pooling, and
 * inspection of the execution context.
 */
public class HttpClientSample1 {

    /**
     * Fetches the HTML of a page with a simple GET request and prints it to standard output.
     *
     * @throws Exception if the request fails or the response body cannot be read
     */
    public void grabPageHTML() throws Exception {
        HttpClient httpclient = new DefaultHttpClient();
        HttpGet httpget = new HttpGet("http://www.baidu.com/");
        try {
            HttpResponse response = httpclient.execute(httpget);
            HttpEntity entity = response.getEntity();
            // "GBK" is passed as the default charset argument of EntityUtils.toString.
            String html = EntityUtils.toString(entity, "GBK");
            System.out.println(html);
        } finally {
            // Release the request even when executing it or reading the body fails.
            httpget.releaseConnection();
            // This client is not reused, so shut its connection manager down.
            httpclient.getConnectionManager().shutdown();
        }
    }

    /**
     * Downloads a file to the local disk (in this sample, a captcha image).
     *
     * @throws Exception if the request fails or the file cannot be written
     */
    public void downloadFile() throws Exception {
        String url = "http://www.lashou.com/account/captcha";
        String destfilename = "D:\\TDDOWNLOAD\\yz.png";

        HttpClient httpclient = new DefaultHttpClient();
        HttpGet httpget = new HttpGet(url);
        try {
            File file = new File(destfilename);
            if (file.exists()) {
                file.delete();
            }

            HttpResponse response = httpclient.execute(httpget);
            HttpEntity entity = response.getEntity();
            InputStream in = entity.getContent();
            try {
                FileOutputStream fout = new FileOutputStream(file);
                try {
                    byte[] tmp = new byte[2048];
                    int l;
                    while ((l = in.read(tmp)) != -1) {
                        // Write only the bytes actually read; writing the whole buffer
                        // would append stale data after a short read and corrupt the file.
                        fout.write(tmp, 0, l);
                    }
                } finally {
                    fout.close();
                }
            } finally {
                // When reading an HttpEntity through its InputStream, always close the stream.
                in.close();
            }
        } finally {
            httpget.releaseConnection();
            // This client is not reused, so shut its connection manager down.
            httpclient.getConnectionManager().shutdown();
        }
    }

    /**
     * Simulates a form login with POST, then opens a members-only page in the same session.
     *
     * <p>Runs three steps on a single {@code HttpClient}: download the captcha image, post
     * the login form (the captcha text is read from standard input), and fetch the member
     * page.
     *
     * @throws Exception if any request fails, or the captcha file or console input cannot
     *     be processed
     */
    public void login2Lashou() throws Exception {
        HttpClient httpclient = new DefaultHttpClient();
        try {
            // Step 1: download the captcha image to the local disk.
            String url = "http://www.lashou.com/account/captcha";
            String destfilename = "D:\\TDDOWNLOAD\\yz.png";
            File file = new File(destfilename);
            if (file.exists()) {
                file.delete();
            }

            HttpGet httpget = new HttpGet(url);
            try {
                HttpResponse response = httpclient.execute(httpget);
                HttpEntity entity = response.getEntity();
                InputStream in = entity.getContent();
                try {
                    FileOutputStream fout = new FileOutputStream(file);
                    try {
                        byte[] tmp = new byte[2048];
                        int l;
                        while ((l = in.read(tmp)) != -1) {
                            // Write only the bytes actually read, not the whole buffer.
                            fout.write(tmp, 0, l);
                        }
                    } finally {
                        fout.close();
                    }
                } finally {
                    in.close();
                }
            } finally {
                httpget.releaseConnection();
            }

            // Step 2: post the login form; the captcha text must be typed in by hand.
            BufferedReader br = new BufferedReader(new InputStreamReader(System.in));
            System.out.println("請輸入下載下來的驗證碼中顯示的數字...");
            String yan = br.readLine();

            HttpPost httppost = new HttpPost("http://www.lashou.com/account/login/");
            try {
                List<NameValuePair> params = new ArrayList<NameValuePair>();
                params.add(new BasicNameValuePair("user_id", "testuser007"));
                params.add(new BasicNameValuePair("pwd", "asdfg123"));
                params.add(new BasicNameValuePair("yan", yan));
                params.add(new BasicNameValuePair("save_user", "on"));
                params.add(new BasicNameValuePair("save_pwd", "on"));
                params.add(new BasicNameValuePair("sub", "登錄"));
                // Without an explicit charset the form is encoded as ISO-8859-1, which
                // cannot represent the non-Latin "sub" value.
                // NOTE(review): UTF-8 is assumed here - confirm the charset the login form
                // expects.
                httppost.setEntity(new UrlEncodedFormEntity(params, "UTF-8"));

                HttpResponse response = httpclient.execute(httppost);
                HttpEntity entity = response.getEntity();
                // The result could be analysed with a tool such as Jsoup to decide whether
                // the login succeeded.
                String postResult = EntityUtils.toString(entity, "GBK");

                // Here we simply print the current cookies to judge whether the login
                // succeeded.
                List<Cookie> cookies =
                        ((AbstractHttpClient) httpclient).getCookieStore().getCookies();
                for (Cookie cookie : cookies) {
                    System.out.println(cookie);
                }
            } finally {
                httppost.releaseConnection();
            }

            // Step 3: open the member page to confirm the login
            // (users who are not logged in cannot open it).
            String memberpage = "http://www.lashou.com/account/orders/";
            httpget = new HttpGet(memberpage);
            try {
                // Must be the same HttpClient so the session cookies are sent.
                HttpResponse response = httpclient.execute(httpget);
                HttpEntity entity = response.getEntity();
                String html = EntityUtils.toString(entity, "GBK");
                System.out.println(html);
            } finally {
                httpget.releaseConnection();
            }
        } finally {
            // All three steps are done; shut the client's connection manager down.
            httpclient.getConnectionManager().shutdown();
        }
    }

    /**
     * Shows two ways of configuring a default proxy for an {@code HttpClient}.
     *
     * @throws Exception declared for parity with the other samples
     */
    public void testProxy() throws Exception {
        HttpHost proxy = new HttpHost("127.0.0.1", 8888);

        // Option 1: set the proxy on the parameters of an existing client.
        HttpClient httpclient = new DefaultHttpClient();
        httpclient.getParams().setParameter(ConnRoutePNames.DEFAULT_PROXY, proxy);

        // Option 2: prepare the parameters first and pass them to the constructor.
        HttpParams params = new BasicHttpParams();
        params.setParameter(ConnRoutePNames.DEFAULT_PROXY, proxy);
        HttpClient httpclient1 = new DefaultHttpClient(params);
    }

    /**
     * Shows how to register several commonly used HTTP headers as client-wide defaults.
     *
     * @throws Exception declared for parity with the other samples
     */
    public void testBasicHeader() throws Exception {
        HttpParams params = new BasicHttpParams();
        Collection<BasicHeader> collection = new ArrayList<BasicHeader>();
        collection.add(new BasicHeader("Accept", "text/html, application/xhtml+xml, */*"));
        collection.add(new BasicHeader("Referer", "http://www.sina.com/"));
        collection.add(new BasicHeader("Accept-Language", "zh-CN"));
        collection.add(new BasicHeader("User-Agent",
                "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)"));
        collection.add(new BasicHeader("Accept-Encoding", "gzip, deflate"));
        params.setParameter(ClientPNames.DEFAULT_HEADERS, collection);

        HttpClient httpclient = new DefaultHttpClient(params);

        // Remainder omitted.
    }

    /**
     * Shows how to configure a pooling connection manager for multi-threaded use. This is
     * especially useful when one logged-in {@code HttpClient} is used to fetch many pages.
     *
     * @throws Exception declared for parity with the other samples
     */
    public void testConnectionManager() throws Exception {
        // Connection pool settings.
        SchemeRegistry schemeRegistry = new SchemeRegistry();
        schemeRegistry.register(new Scheme("http", 80, PlainSocketFactory.getSocketFactory()));
        schemeRegistry.register(new Scheme("https", 443, SSLSocketFactory.getSocketFactory()));

        PoolingClientConnectionManager cm = new PoolingClientConnectionManager(schemeRegistry);
        cm.setMaxTotal(200);            // maximum number of connections in the pool
        cm.setDefaultMaxPerRoute(20);   // default maximum number of connections per route

        // A specific site can be given its own per-route maximum.
        // The host name was previously misspelled "locahost", so the limit never matched
        // localhost.
        HttpHost localhost = new HttpHost("localhost", 80);
        cm.setMaxPerRoute(new HttpRoute(localhost), 30);

        // Other settings.
        HttpParams params = new BasicHttpParams();
        params.setParameter(ClientPNames.COOKIE_POLICY, CookiePolicy.BROWSER_COMPATIBILITY);

        HttpClient httpclient = new DefaultHttpClient(cm, params);

        // Remainder omitted.
    }

    /**
     * Requests a page and prints the attributes recorded in the execution
     * {@code HttpContext}: connection, target host, proxy, request, response and the
     * request-sent flag.
     *
     * @throws Exception if the request fails or the response cannot be consumed
     */
    public void testContext() throws Exception {
        DefaultHttpClient httpclient = new DefaultHttpClient();
        try {
            HttpContext localContext = new BasicHttpContext();
            HttpGet httpget = new HttpGet("http://www.baidu.com/");

            HttpResponse response = httpclient.execute(httpget, localContext);
            try {
                // the actual connection to the target server.
                HttpConnection conn = (HttpConnection) localContext.getAttribute(
                        ExecutionContext.HTTP_CONNECTION);
                System.out.println("Socket timeout: " + conn.getSocketTimeout());

                // the connection target
                HttpHost target = (HttpHost) localContext.getAttribute(
                        ExecutionContext.HTTP_TARGET_HOST);
                System.out.println("Final target: " + target);

                // the connection proxy, if used
                HttpHost proxy = (HttpHost) localContext.getAttribute(
                        ExecutionContext.HTTP_PROXY_HOST);
                if (proxy != null) {
                    System.out.println("Proxy host/port: " + proxy.getHostName() + "/"
                            + proxy.getPort());
                }

                // the actual HTTP request
                HttpRequest request = (HttpRequest) localContext.getAttribute(
                        ExecutionContext.HTTP_REQUEST);
                System.out.println("HTTP version: " + request.getProtocolVersion());
                Header[] headers = request.getAllHeaders();
                System.out.println("HTTP Headers: ");
                for (Header header : headers) {
                    System.out.println("\t" + header.getName() + ": " + header.getValue());
                }
                System.out.println("HTTP URI: " + request.getRequestLine().getUri());

                // the actual HTTP response, as recorded in the context
                HttpResponse contextResponse = (HttpResponse) localContext.getAttribute(
                        ExecutionContext.HTTP_RESPONSE);
                HttpEntity entity = contextResponse.getEntity();
                if (entity != null) {
                    System.out.println("Content Encoding:" + entity.getContentEncoding());
                    System.out.println("Content Type:" + entity.getContentType());
                }

                // the flag indicating whether the actual request has been fully transmitted
                // to the connection target.
                System.out.println("Sent flag: "
                        + localContext.getAttribute(ExecutionContext.HTTP_REQ_SENT));
            } finally {
                // The entity content is never read here, so consume it to make sure the
                // underlying resources are released, even if one of the lookups above failed.
                EntityUtils.consume(response.getEntity());
            }
        } finally {
            // This client is not reused, so shut its connection manager down.
            httpclient.getConnectionManager().shutdown();
        }
    }

    /**
     * Runs one of the samples; uncomment the call you want to try.
     *
     * @param args command-line arguments (unused)
     * @throws Exception if the selected sample fails
     */
    public static void main(String[] args) throws Exception {
        HttpClientSample1 ins = new HttpClientSample1();

        // ins.grabPageHTML();
        // ins.downloadFile();
        ins.login2Lashou();
        // ins.testContext();
    }
}