java通過Jsoup爬取網頁(入門教程)


一,導入依賴

     <!-- Java crawler (Jsoup) -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.10.3</version>
        </dependency>
        <!-- HttpClient dependency -->
        <!-- NOTE(review): no version is declared for this dependency; presumably it is supplied by a parent POM or a dependencyManagement section. Confirm, otherwise add an explicit version. -->
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
        </dependency>

二,編寫demo類

注意不要導錯包了:Document 和 Element 是 org.jsoup.nodes 下面的,Elements 是 org.jsoup.select 下面的

package com.taotao.entity;

import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.IOException;

/**
 * Minimal Jsoup crawling demo: downloads a web page with Apache HttpClient, parses the
 * HTML with Jsoup, and prints the page title plus the text of the element whose id is
 * {@code site_nav_top}.
 *
 * Author: TaoTao  2019/9/26
 */
public class intefaceTest {
    /**
     * Fetches the page, parses it, and prints the extracted values.
     *
     * <p>Found values go to standard output; a short notice goes to standard error when
     * the title or the {@code site_nav_top} element is missing from the page.
     *
     * @param args command-line arguments (unused)
     * @throws IOException if the HTTP request fails or the response body cannot be read
     */
    public static void main(String[] args) throws IOException {
        HttpGet httpGet = new HttpGet("http://www.cnblogs.com/"); // GET request for the target page

        String content;
        // try-with-resources closes the response and then the client in reverse order,
        // even when execute() or the body read throws, so no connection is leaked.
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(httpGet)) {
            HttpEntity entity = response.getEntity(); // response body
            content = EntityUtils.toString(entity, "utf-8"); // page HTML decoded as UTF-8
        }

        Document doc = Jsoup.parse(content); // parse the HTML once into a document

        // All elements with tag name "title"; first() yields null when there are none,
        // which avoids the IndexOutOfBoundsException that get(0) would raise.
        Elements elements = doc.getElementsByTag("title");
        Element element = elements.first();
        if (element != null) {
            String title = element.text(); // text() returns plain text; html() would return markup
            System.out.println("網頁標題:" + title);
        } else {
            System.err.println("No <title> element found in the page");
        }

        // getElementById returns null when no element carries the id, so guard before use.
        Element element1 = doc.getElementById("site_nav_top");
        if (element1 != null) {
            String str = element1.text();
            System.out.println("str:" + str);
        } else {
            System.err.println("No element with id=site_nav_top found in the page");
        }
    }
}

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯繫本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM