一、HttpClient簡介
HttpClient是一個客戶端的HTTP通信實現庫,它不是一個瀏覽器。關於HTTP協議,可以搜索相關的資料。它設計的目的是發送與接收HTTP報文。它不會執行嵌入在頁面中的JavaScript代碼,所以當需要抓取通過AJAX技術獲取實際內容的頁面時,需要使用WebClient等其他開源庫。HttpClient最新版已經到第5版,但已經穩定的應該是4.5.2版本,官方網址:http://hc.apache.org/ 。
二、HttpClient簡單使用
HttpClient的主要用途是發送HTTP請求並接收HTTP響應的內容,下面介紹HttpClient的簡單使用:抓取博客園的首頁。至於HttpClient 4.5的常用API,可以參考這篇文章(針對4.3.x版本撰寫):http://liangbizhi.github.io/httpclient-4-3-x-chapter-1/ 。
package com.httpclient.demo;

import java.io.IOException;
import java.nio.charset.Charset;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.http.HttpEntity;
import org.apache.http.HttpStatus;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.entity.ContentType;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

/**
 * Minimal HttpClient example: downloads the cnblogs home page and prints it
 * to standard output, decoding the body with the charset announced by the
 * server or, failing that, the one declared in an HTML {@code <meta>} tag.
 */
public class SimpleHttpClient {

    /** Charset used when neither the HTTP header nor the page declares one. */
    private static final Charset FALLBACK_CHARSET = Charset.forName("ISO-8859-1");

    /**
     * Matches both {@code <meta charset="utf-8">} and
     * {@code <meta http-equiv="Content-Type" content="text/html; charset=utf-8">}.
     * Group 1 is the charset name and is never empty.
     */
    private static final Pattern META_CHARSET = Pattern.compile(
            "<meta\\b[^>]*?charset\\s*=\\s*[\"']?\\s*([a-z0-9][a-z0-9._:+-]*)",
            Pattern.CASE_INSENSITIVE);

    /**
     * Fetches the cnblogs home page with HttpClient and prints its content.
     *
     * @param args unused
     * @throws ClientProtocolException on an HTTP protocol error
     * @throws IOException             on a connection problem or aborted transfer
     */
    public static void main(String[] args) throws ClientProtocolException, IOException {
        String targetUrl = "http://www.cnblogs.com/";
        // 1. Create the HttpClient instance.
        CloseableHttpClient client = HttpClients.createDefault();
        try {
            // 2. Build the GET request.
            HttpGet get = new HttpGet(targetUrl);
            // 3. Send the GET request.
            CloseableHttpResponse res = client.execute(get);
            try {
                // 4. Handle the response.
                if (res.getStatusLine().getStatusCode() != HttpStatus.SC_OK) {
                    System.err.println("Unexpected response: " + res.getStatusLine());
                    return;
                }
                HttpEntity entity = res.getEntity();
                if (entity == null) {
                    System.err.println("Response has no body: " + res.getStatusLine());
                    return;
                }
                Charset charset = ContentType.getOrDefault(entity).getCharset();
                // Read the raw bytes first so the charset can still be sniffed from the markup.
                byte[] content = EntityUtils.toByteArray(entity);
                if (charset == null) {
                    charset = sniffCharset(content);
                }
                System.out.println(new String(content, charset));
            } finally {
                // Always release the connection, even when reading the body fails.
                res.close();
            }
        } finally {
            client.close();
        }
    }

    /**
     * Looks for a charset declaration inside an HTML {@code <meta>} tag.
     *
     * @param content raw bytes of the page
     * @return the declared charset, or ISO-8859-1 when none is declared or the
     *         declared name is illegal or unsupported on this JVM
     */
    private static Charset sniffCharset(byte[] content) {
        // ISO-8859-1 maps every byte to a character, so the ASCII markup is
        // readable regardless of the platform default charset.
        String probe = new String(content, FALLBACK_CHARSET);
        Matcher m = META_CHARSET.matcher(probe);
        if (m.find()) {
            try {
                return Charset.forName(m.group(1));
            } catch (IllegalArgumentException ignored) {
                // IllegalCharsetNameException / UnsupportedCharsetException:
                // the page declares a charset this JVM cannot use; fall through.
            }
        }
        return FALLBACK_CHARSET;
    }
}
三、HttpClient模擬登陸
HTTP協議本來是無狀態的,但為了保持會話的狀態,使用Cookie保存Session信息;當向服務器發送請求時會附加這些會話信息,從而能區分不同會話的狀態。用戶登陸過程,簡單而言,就是首先驗證用戶名與密碼,然後服務器生成會話信息並由客戶端保存到本地,最後用戶憑借會話信息就能夠訪問用戶信息等需要登陸的網頁。
HttpClient4.5通過CookieStore保存用戶的會話信息,還提供HttpClientContext保存用戶連接的信息。下面是一個使用HttpClient模擬知乎登陸的簡單案例。
package com.httpclient.demo;

import java.io.IOException;
import java.util.LinkedList;
import java.util.List;

import org.apache.http.Consts;
import org.apache.http.HttpEntity;
import org.apache.http.NameValuePair;
import org.apache.http.client.CookieStore;
import org.apache.http.client.config.CookieSpecs;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.protocol.HttpClientContext;
import org.apache.http.cookie.Cookie;
import org.apache.http.impl.client.BasicCookieStore;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;

/**
 * Simulates logging in to Zhihu: fetches the home page to collect the initial
 * cookies, posts the login form, then requests a page that requires a login.
 * All requests share one cookie store so the session carries over.
 */
public class ZhiHuTest {

    /**
     * Runs the three-step login demo. I/O failures are caught and reported on
     * standard error.
     *
     * @param args unused
     * @throws java.text.ParseException never thrown by the current body; kept
     *         so the method signature stays unchanged
     */
    public static void main(String[] args) throws java.text.ParseException {
        String name = "username";
        String password = "password";

        // Global request settings.
        RequestConfig globalConfig = RequestConfig.custom()
                .setCookieSpec(CookieSpecs.STANDARD)
                .build();
        // Local cookie store shared by the client and the context.
        CookieStore cookieStore = new BasicCookieStore();
        // HttpClient execution context.
        HttpClientContext context = HttpClientContext.create();
        context.setCookieStore(cookieStore);
        // The HttpClient itself.
        CloseableHttpClient httpClient = HttpClients.custom()
                .setDefaultRequestConfig(globalConfig)
                .setDefaultCookieStore(cookieStore)
                .build();

        try {
            try {
                fetchInitialCookies(httpClient, context, cookieStore);
                login(httpClient, context, name, password);
                fetchProtectedPage(httpClient, context);
            } finally {
                httpClient.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /** Requests the home page to obtain the required cookies (e.g. _xsrf) and prints them. */
    private static void fetchInitialCookies(CloseableHttpClient httpClient, HttpClientContext context,
            CookieStore cookieStore) throws IOException {
        HttpGet get = new HttpGet("http://www.zhihu.com/");
        CloseableHttpResponse res = httpClient.execute(get, context);
        try {
            System.out.println("訪問知乎首頁后的獲取的常規Cookie:===============");
            printCookies(cookieStore.getCookies());
        } finally {
            // Close on every path so the connection is not leaked on failure.
            res.close();
        }
    }

    /** Posts the login form, prints the response and the cookies held afterwards. */
    private static void login(CloseableHttpClient httpClient, HttpClientContext context,
            String name, String password) throws IOException {
        // Build the form data.
        List<NameValuePair> valuePairs = new LinkedList<NameValuePair>();
        valuePairs.add(new BasicNameValuePair("email", name));
        valuePairs.add(new BasicNameValuePair("password", password));
        valuePairs.add(new BasicNameValuePair("remember_me", "true"));
        UrlEncodedFormEntity entity = new UrlEncodedFormEntity(valuePairs, Consts.UTF_8);
        entity.setContentType("application/x-www-form-urlencoded");

        // Create the POST request and attach the form data.
        HttpPost post = new HttpPost("https://www.zhihu.com/login/email");
        post.setEntity(entity);

        CloseableHttpResponse res = httpClient.execute(post, context);
        try {
            // Print the response to see whether the login succeeded.
            System.out.println("打印響應信息===========");
            // NOTE(review): HttpClientUtils.printResponse is neither imported nor
            // defined in this file; presumably a helper in this package - verify.
            HttpClientUtils.printResponse(res);
        } finally {
            res.close();
        }

        System.out.println("登陸成功后,新的Cookie:===============");
        printCookies(context.getCookieStore().getCookies());
    }

    /** Requests a page that requires a login, to check that the session is valid. */
    private static void fetchProtectedPage(CloseableHttpClient httpClient, HttpClientContext context)
            throws IOException {
        HttpGet newGet = new HttpGet("http://www.zhihu.com/question/following");
        CloseableHttpResponse res = httpClient.execute(newGet, context);
        try {
            HttpEntity body = res.getEntity();
            // UTF-8 is only the default; a charset declared by the server still wins.
            String content = (body == null) ? "" : EntityUtils.toString(body, Consts.UTF_8);
            System.out.println("登陸成功后訪問的頁面===============");
            System.out.println(content);
        } finally {
            res.close();
        }
    }

    /** Prints every cookie as {@code name: value}, one per line. */
    private static void printCookies(List<Cookie> cookies) {
        for (Cookie c : cookies) {
            System.out.println(c.getName() + ": " + c.getValue());
        }
    }
}