詳細的介紹已經有很多前輩總結過,這裡引用其中一篇文章:https://blog.csdn.net/zhuwukai/article/details/78644484
下面是一個使用 HttpClient 發送請求並獲取網頁源碼的代碼示例:
package com.http.client;

import java.io.IOException;

import org.apache.http.HttpHost;
import org.apache.http.HttpResponse;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.conn.params.ConnRouteParams;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.params.CoreConnectionPNames;
import org.apache.http.util.EntityUtils;
import org.apache.log4j.Logger;

/**
 * Demo that downloads the HTML source of a web page with Apache HttpClient
 * and prints it to standard output.
 *
 * @author oo
 * @date 2018-04-04
 */
public class MyHttpClient {

    private static final Logger logger = Logger.getLogger(MyHttpClient.class);

    /**
     * Requirement: crawl website data with HttpClient.
     *
     * <p>Sends a GET request through a fixed proxy and prints the response body
     * decoded as UTF-8. If the request fails, the error is logged and an empty
     * string is printed instead.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        // Create the HttpClient object.
        HttpClient hclient = new DefaultHttpClient();

        // Configure the connection timeout, the socket (read) timeout and a proxy server.
        // The proxy is there so the crawler's own IP does not get banned by the target site.
        hclient.getParams()
                .setParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, 20000)
                .setParameter(CoreConnectionPNames.SO_TIMEOUT, 20000)
                .setParameter(ConnRouteParams.DEFAULT_PROXY, new HttpHost("111.155.116.237", 8123));

        HttpGet hGet = new HttpGet("http://www.itcast.cn/");
        String content = "";
        try {
            // Send the request to the site and fetch the page source.
            HttpResponse response = hclient.execute(hGet);
            // A response may legitimately carry no entity; EntityUtils rejects null.
            if (response.getEntity() != null) {
                // EntityUtils converts the response entity into a string.
                content = EntityUtils.toString(response.getEntity(), "utf-8");
            }
        } catch (ClientProtocolException e) {
            // Pass the exception as the throwable so the stack trace ends up in the log.
            logger.error("********ClientProtocolException", e);
        } catch (IOException e) {
            logger.error("********IOException", e);
        } finally {
            // Release the pooled connection and its socket whether or not the request succeeded.
            hclient.getConnectionManager().shutdown();
        }
        System.out.println(content);
    }
}
使用Jsoup進行請求:
package com.http.client;

import java.io.IOException;

import org.apache.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Demo that fetches a web page with jsoup and prints the text and target of
 * every {@code <a>} element it contains.
 */
public class MyJsoup {

    private static final Logger logger = Logger.getLogger(MyJsoup.class);

    /**
     * Fetches the page, prints the combined text of all anchors on one line,
     * then prints one {@code text:href} line per anchor.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        try {
            // Send the request with jsoup and parse the response into a document.
            Document document = Jsoup.connect("http://www.itcast.cn").get();

            Elements elements = document.getElementsByTag("a");

            // Combined text of every matched anchor.
            String val = elements.text();
            System.out.println(val);

            for (Element element : elements) {
                System.out.println(element.text() + ":" + element.attr("href"));
            }
        } catch (IOException e) {
            // Pass the exception as the throwable so the stack trace ends up in the log.
            logger.error("***********IOException: 連接失敗", e);
        }
    }
}
HttpClient 結合Jsoup:
package com.http.client;

import java.io.IOException;

import org.apache.http.HttpResponse;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Demo that combines the two libraries: Apache HttpClient downloads the page
 * and jsoup parses it and selects elements from it.
 */
public class HttpCLientAndJsoup {

    /**
     * Downloads the page, selects the {@code div.salary_con li} elements and
     * prints the text of each one on its own line.
     *
     * @param args command-line arguments (unused)
     * @throws ClientProtocolException if the HTTP exchange violates the protocol
     * @throws IOException if the request fails or the body cannot be read
     */
    public static void main(String[] args) throws ClientProtocolException, IOException {
        // Create the HttpClient object.
        HttpClient hClient = new DefaultHttpClient();
        try {
            // Most crawler URLs are GET requests, so build a GET request object.
            HttpGet hget = new HttpGet("http://www.itcast.cn/");
            // Send the request to the site and fetch the page source.
            HttpResponse response = hClient.execute(hget);
            // EntityUtils converts the response entity into a string; a response
            // without an entity is treated as an empty page because EntityUtils rejects null.
            String content = response.getEntity() == null
                    ? ""
                    : EntityUtils.toString(response.getEntity(), "utf-8");
            // jsoup is responsible for parsing the page.
            Document doc = Jsoup.parse(content);
            // Pick the page content with a CSS-style element selector.
            Elements elements = doc.select("div.salary_con li");
            for (Element element : elements) {
                String text = element.text();
                System.out.println(text);
            }
        } finally {
            // Release the pooled connection and its socket even when the request throws.
            hClient.getConnectionManager().shutdown();
        }
    }
}