C# HtmlAgilityPack爬取靜態頁面

本文轉載自查看原文 2019-09-04 22:59 540 C#/ 學習記錄

最近對爬蟲很感興趣，稍微研究了一下，利用HtmlAgilityPack製作了一個十分簡單的爬蟲。這個簡易爬蟲只能獲取靜態頁面的Html，無法取得由JavaScript動態生成的內容。

HtmlAgilityPack簡介

HtmlAgilityPack是一個開源且解析速度十分快的Html解析工具。它支持使用Xpath解析Html，能夠幫助我們解析Html文檔就像解析Xml文檔一樣輕鬆、方便。

C#安裝HtmlAgilityPack

如果VS安裝有Nuget，在Nuget直接搜索HtmlAgilityPack安裝即可。
如果沒有Nuget，也可以手動下載壓縮包：下載後解壓縮會得到3個文件，這裡只需要將其中的HtmlAgilityPack.dll、HtmlAgilityPack.xml引入解決方案中即可使用。

實例(獲取某頁面圖片)

加載HTML頁面

// Load the HTML document from a web URL
var url = "https://www.bilibili.com";
var web = new HtmlWeb();
var hd = web.Load(url);

利用WebClient寫一個圖片下載器

需要using System.Net和using System.IO

/// <summary>
/// Helper that saves remote images to disk through a <see cref="WebClient"/>.
/// </summary>
public class ImgDownloader
{
    /// <summary>
    /// Downloads one image into <paramref name="folderPath"/>, creating the folder first
    /// when it does not exist. A failed download is reported on the console instead of
    /// being thrown, so one bad image does not stop the caller's loop.
    /// </summary>
    /// <param name="webClient">Client used to perform the download.</param>
    /// <param name="url">
    /// Image URL. When it does not start with "http:" or "https:" (for example a
    /// protocol-relative "//host/a.png"), "https:" is prepended.
    /// </param>
    /// <param name="folderPath">Target folder, with or without a trailing separator.</param>
    /// <param name="fileName">File name, including extension, to save the image as.</param>
    /// <exception cref="ArgumentNullException"><paramref name="url"/> is null.</exception>
    public static void DownloadImg(WebClient webClient, string url, string folderPath, string fileName)
    {
        if (url == null)
        {
            throw new ArgumentNullException("url");
        }

        // CreateDirectory does nothing when the folder already exists, so no Exists check is needed.
        Directory.CreateDirectory(folderPath);

        // Only a scheme at the very start of the URL counts. Searching the whole string
        // would be fooled by an "http:" that merely appears inside the query string.
        url = url.Trim();
        bool hasHttpScheme = url.StartsWith("http:", StringComparison.OrdinalIgnoreCase)
            || url.StartsWith("https:", StringComparison.OrdinalIgnoreCase);
        if (!hasHttpScheme)
        {
            url = "https:" + url;
        }

        try
        {
            // Path.Combine inserts the separator when folderPath lacks a trailing one.
            string targetPath = Path.Combine(folderPath, fileName);
            webClient.DownloadFile(url, targetPath);
            Console.WriteLine(fileName + "下載成功");
        }
        catch (Exception ex)
        {
            // Deliberately broad: the download is best-effort, so log the failure and carry on.
            Console.Write(ex.Message);
            Console.WriteLine(url);
        }
    }
}

通過Xpath獲取img標簽中的圖片

string imgPath = "//img";// XPath selecting every img element
int imgNum = 0;// running number used to build the file names
// Download the image referenced by each img element.
// SelectNodes returns null (not an empty collection) when nothing matches.
HtmlNodeCollection imgNodes = hd.DocumentNode.SelectNodes(imgPath);
if (imgNodes != null)
{
    foreach (HtmlNode node in imgNodes)
    {
        HtmlAttribute srcAttribute = node.Attributes["src"];
        if (srcAttribute == null || string.IsNullOrWhiteSpace(srcAttribute.Value))
        {
            continue;
        }

        string imgUrl = srcAttribute.Value.Trim();
        // A data: URI embeds the image itself, so there is nothing to download.
        if (imgUrl.StartsWith("data:", StringComparison.OrdinalIgnoreCase))
        {
            continue;
        }

        imgNum++;
        // Build the file name from the extension of the URL path only: drop the query
        // string/fragment and ignore dots that belong to the host or a folder name.
        string urlPath = imgUrl.Split('?', '#')[0];
        int dotIndex = urlPath.LastIndexOf('.');
        string extension = dotIndex > urlPath.LastIndexOf('/') ? urlPath.Substring(dotIndex) : "";
        string fileName = imgNum + extension;
        ImgDownloader.DownloadImg(wc, imgUrl, "images/", fileName);
    }
}

通過Xpath獲取背景圖

// Download background images declared in inline styles.
string bgImgPath = "//*[@style]";// XPath selecting every node that has a style attribute
// SelectNodes returns null (not an empty collection) when nothing matches.
HtmlNodeCollection styledNodes = hd.DocumentNode.SelectNodes(bgImgPath);
if (styledNodes != null)
{
    foreach (HtmlNode node in styledNodes)
    {
        // Capture the address inside url(...), tolerating whitespace around ':' and
        // leaving out the optional quotes around the address.
        Match bgMatch = Regex.Match(
            node.Attributes["style"].Value,
            @"background-image\s*:\s*url\(\s*['""]?(?<url>[^'"")]+?)\s*['""]?\s*\)",
            RegexOptions.IgnoreCase);
        if (!bgMatch.Success)
        {
            continue;
        }

        string bgImgUrl = bgMatch.Groups["url"].Value.Trim();
        // Skip empty addresses and data: URIs, which embed the image itself.
        if (bgImgUrl.Length == 0 || bgImgUrl.StartsWith("data:", StringComparison.OrdinalIgnoreCase))
        {
            continue;
        }

        // Count only images that are actually handed to the downloader.
        imgNum++;
        // Build the file name from the extension of the URL path only: drop the query
        // string/fragment and ignore dots that belong to the host or a folder name.
        string urlPath = bgImgUrl.Split('?', '#')[0];
        int dotIndex = urlPath.LastIndexOf('.');
        string extension = dotIndex > urlPath.LastIndexOf('/') ? urlPath.Substring(dotIndex) : "";
        string fileName = imgNum + extension;

        ImgDownloader.DownloadImg(wc, bgImgUrl, "images/bgcImg/", fileName);
    }
}

完整代碼

using System;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
using HtmlAgilityPack;

namespace WebCrawlerDemo
{
    /// <summary>
    /// Console demo that downloads the images referenced by a static HTML page:
    /// first the src of every img element, then background images declared in inline styles.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Finds a CSS background-image declaration and captures the address inside
        /// url(...), tolerating whitespace and leaving out the optional quotes.
        /// </summary>
        private static readonly Regex BackgroundImageRegex = new Regex(
            @"background-image\s*:\s*url\(\s*['""]?(?<url>[^'"")]+?)\s*['""]?\s*\)",
            RegexOptions.IgnoreCase);

        /// <summary>
        /// Loads the page, then downloads img sources into "images/" and inline-style
        /// background images into "images/bgcImg/", numbering the files consecutively.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            string url = "https://www.bilibili.com";
            HtmlWeb web = new HtmlWeb();
            HtmlDocument hd = web.Load(url);// download and parse the html page

            int imgNum = 0;// running number shared by both passes to build file names

            // WebClient is disposable; release it once all downloads are done.
            using (WebClient wc = new WebClient())
            {
                // Pass 1: images referenced by img elements.
                // SelectNodes returns null (not an empty collection) when nothing matches.
                HtmlNodeCollection imgNodes = hd.DocumentNode.SelectNodes("//img");
                if (imgNodes != null)
                {
                    foreach (HtmlNode node in imgNodes)
                    {
                        HtmlAttribute srcAttribute = node.Attributes["src"];
                        if (srcAttribute == null)
                        {
                            continue;
                        }

                        string imgUrl = CleanImageUrl(srcAttribute.Value);
                        if (imgUrl == null)
                        {
                            continue;
                        }

                        imgNum++;
                        ImgDownloader.DownloadImg(wc, imgUrl, "images/", imgNum + GetExtension(imgUrl));
                    }
                }

                // Pass 2: background images declared in inline style attributes.
                HtmlNodeCollection styledNodes = hd.DocumentNode.SelectNodes("//*[@style]");
                if (styledNodes != null)
                {
                    foreach (HtmlNode node in styledNodes)
                    {
                        Match bgMatch = BackgroundImageRegex.Match(node.Attributes["style"].Value);
                        if (!bgMatch.Success)
                        {
                            continue;
                        }

                        string bgImgUrl = CleanImageUrl(bgMatch.Groups["url"].Value);
                        if (bgImgUrl == null)
                        {
                            continue;
                        }

                        // Count only images that are actually handed to the downloader.
                        imgNum++;
                        ImgDownloader.DownloadImg(wc, bgImgUrl, "images/bgcImg/", imgNum + GetExtension(bgImgUrl));
                    }
                }
            }

            Console.WriteLine("----------END----------");
            Console.ReadKey();
        }

        /// <summary>
        /// Trims a candidate image address and rejects the ones that cannot be downloaded.
        /// </summary>
        /// <param name="rawUrl">Address as found in the page; may be null or blank.</param>
        /// <returns>
        /// The trimmed address, or null when it is blank or a data: URI
        /// (which embeds the image itself instead of pointing at a file).
        /// </returns>
        private static string CleanImageUrl(string rawUrl)
        {
            if (string.IsNullOrWhiteSpace(rawUrl))
            {
                return null;
            }

            string cleaned = rawUrl.Trim();
            if (cleaned.StartsWith("data:", StringComparison.OrdinalIgnoreCase))
            {
                return null;
            }

            return cleaned;
        }

        /// <summary>
        /// Returns the file extension (including the leading dot) of the last path
        /// segment of an image address, ignoring any query string or fragment.
        /// </summary>
        /// <param name="imageUrl">Non-null image address.</param>
        /// <returns>The extension, or an empty string when the last segment has no dot.</returns>
        private static string GetExtension(string imageUrl)
        {
            string urlPath = imageUrl.Split('?', '#')[0];
            int dotIndex = urlPath.LastIndexOf('.');

            // A dot before the last '/' belongs to the host or a folder name, not to the file.
            if (dotIndex > urlPath.LastIndexOf('/'))
            {
                return urlPath.Substring(dotIndex);
            }

            return "";
        }
    }
    /// <summary>
    /// Helper that saves remote images to disk through a <see cref="WebClient"/>.
    /// </summary>
    public class ImgDownloader
    {
        /// <summary>
        /// Downloads one image into <paramref name="folderPath"/>, creating the folder first
        /// when it does not exist. A failed download is reported on the console instead of
        /// being thrown, so one bad image does not stop the caller's loop.
        /// </summary>
        /// <param name="webClient">Client used to perform the download.</param>
        /// <param name="url">
        /// Image URL. When it does not start with "http:" or "https:" (for example a
        /// protocol-relative "//host/a.png"), "https:" is prepended.
        /// </param>
        /// <param name="folderPath">Target folder, with or without a trailing separator.</param>
        /// <param name="fileName">File name, including extension, to save the image as.</param>
        /// <exception cref="ArgumentNullException"><paramref name="url"/> is null.</exception>
        public static void DownloadImg(WebClient webClient, string url, string folderPath, string fileName)
        {
            if (url == null)
            {
                throw new ArgumentNullException("url");
            }

            // CreateDirectory does nothing when the folder already exists, so no Exists check is needed.
            Directory.CreateDirectory(folderPath);

            // Only a scheme at the very start of the URL counts. Searching the whole string
            // would be fooled by an "http:" that merely appears inside the query string.
            url = url.Trim();
            bool hasHttpScheme = url.StartsWith("http:", StringComparison.OrdinalIgnoreCase)
                || url.StartsWith("https:", StringComparison.OrdinalIgnoreCase);
            if (!hasHttpScheme)
            {
                url = "https:" + url;
            }

            try
            {
                // Path.Combine inserts the separator when folderPath lacks a trailing one.
                string targetPath = Path.Combine(folderPath, fileName);
                webClient.DownloadFile(url, targetPath);
                Console.WriteLine(fileName + "下載成功");
            }
            catch (Exception ex)
            {
                // Deliberately broad: the download is best-effort, so log the failure and carry on.
                Console.Write(ex.Message);
                Console.WriteLine(url);
            }
        }
    }
}

參考文章

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯系本站郵箱yoyou2525@163.com刪除。

猜您在找 C# HtmlAgilityPack+Selenium爬取需要拉動滾動條的頁面內容 C#使用HtmlAgilityPack解析Html 爬取圖片和視頻 C#使用phantomjs，爬取AJAX加載完成之后的頁面爬取靜態網頁 C# 爬取網頁數據 C# 爬取網頁上的數據 c# winform webBrowser爬取數據爬蟲技術（六）-- 使用HtmlAgilityPack獲取頁面鏈接（附c#代碼及插件下載）爬蟲技術 -- 進階學習（九）使用HtmlAgilityPack獲取頁面鏈接（附c#代碼及插件下載） C#使用HtmlAgilityPack快速爬蟲