1. 建立 HTTP 連接並返回 HTML 頁面:
public static String doRequest(String url,String method) { //創建httpClient對象 CloseableHttpClient client=HttpClientBuilder.create().build(); URIBuilder uriBuilder= null; HttpUriRequest uri=null; CloseableHttpResponse response= null; //獲取相應對象 String html=""; //存放響應信息 try { uriBuilder = new URIBuilder(url); switch (method){ case "POST": uri=new HttpPost(uriBuilder.build()); break; case "PUT": uri=new HttpPut(uriBuilder.build()); break; case "DELETE": uri=new HttpDelete(uriBuilder.build()); break; default: uri=new HttpGet(uriBuilder.build()); break; } response = client.execute(uri); if (response.getStatusLine().getStatusCode() == HttpStatus.SC_OK) { // 返回 200 表示成功 html = EntityUtils.toString(response.getEntity(), "utf-8"); // 獲取服務器響應實體的內容 } } catch (URISyntaxException e) { e.printStackTrace(); }catch (IOException e) { e.printStackTrace(); }finally { if (response != null) { try { response.close(); } catch (IOException e) { e.printStackTrace(); } } } return html; }
2.解析頁面獲取想要的數據:
//解析 篩選網頁所需信息 public static void analysisHtml(String html){ //第一步,將字符內容解析成一個Document類 Document document = Jsoup.parse(html); //第二步,根據我們需要得到的標簽,選擇提取相應標簽的內容 Elements elements = document.select("div[class=leftbox]").select("div[class=pr0]"); String name=""; String address=""; for(Element e : elements){ name=e.getElementsByClass("pr2").select("ul").select("li").select("a").first().text(); address=e.getElementsByClass("pr4").text(); Company company=new Company(name,address); companyList.add(company); } }
3. 通過啟動方法開始爬取:
/**
 * Crawls listing pages 1 through 10 and returns the collected companies.
 *
 * <p>For each page a progress line is printed, the page is fetched with a GET request via
 * {@code doRequest} and the result is handed to {@code analysisHtml}. After the last page
 * {@code companyList} is printed and returned.
 *
 * @return {@code companyList} after all pages have been processed
 */
public static List<Company> start() {
    final String baseUrl = "http://www.chinawj.com.cn/qiye/wujinjidian/c1_1_0_";
    final int lastPage = 10;

    int page = 1;
    while (page <= lastPage) {
        System.out.println("開始爬取數據[頁碼:" + page + "]");
        String pageUrl = baseUrl + page + ".html";
        String pageHtml = doRequest(pageUrl, "GET");
        analysisHtml(pageHtml);
        page++;
    }

    System.out.println(companyList);
    return companyList;
}