爬蟲抓取數據的思路是,根據url地址去獲取html,然後解析html,取出需要的數據
首先需要引入HtmlAgilityPack的dll(下載HtmlAgilityPack.dll)
主要是使用HtmlDocument類來加載獲取到的html代碼,轉換為HtmlDocument對象操作
// Parse the fetched HTML string into an HtmlDocument so it can be queried with XPath.
HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
doc.LoadHtml(html);
獲取到HtmlDocument對象以後,根據xpath過濾出對應的節點
// Select every div node whose class attribute equals 'zg_itemImmersion'.
string xpathDiv = "//div[@class='zg_itemImmersion']";
HtmlNodeCollection allDivs = doc.DocumentNode.SelectNodes(xpathDiv);
xpath語法可以自行網上查找,簡單實用,很好理解
完整代碼如下:

/// <summary>
/// Scrapes an Amazon ranking page and appends one row per product to <paramref name="dt"/>.
/// </summary>
/// <remarks>
/// For each of the first three product nodes on the page the method fills the columns
/// "排名", "商品名稱" and "售價", then downloads the product's detail page and, when the
/// detail-bullets section is present, fills "首次上架日期" and "排名信息".
/// A field whose node cannot be found is left unset instead of aborting the whole run.
/// Any exception is reported to the user through a message box and is not rethrown.
/// </remarks>
/// <param name="url">Address of the ranking page to download.</param>
/// <param name="dt">Table that receives the rows; it must already contain the columns listed above.</param>
public static void GetData(string url, ref DataTable dt)
{
    // Only the first few ranked products are scraped (the original loop stopped once i > 2).
    const int maxItems = 3;

    try
    {
        // The page is fetched through the HttpWebRequest-based helper: per the original notes,
        // WebClient receives Amazon's verification page and HtmlWeb stops returning HTML
        // after several loads.
        string html = WebRequestPost(url);
        HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
        doc.LoadHtml(html);

        // SelectNodes returns null (not an empty collection) when nothing matches.
        HtmlNodeCollection allDivs = doc.DocumentNode.SelectNodes("//div[@class='zg_itemImmersion']");
        if (allDivs == null)
        {
            throw new InvalidOperationException("未找到任何商品節點,頁面結構可能已變更或返回了校驗頁面");
        }

        for (int i = 0; i < allDivs.Count && i < maxItems; i++)
        {
            // Relative XPath (".//") searches inside this product node only. An absolute "//"
            // always starts from the document root, which is why the first product was
            // returned every time unless the node was re-parsed into its own document.
            HtmlNode item = allDivs[i];
            DataRow dr = dt.NewRow();

            // Product rank, e.g. "1." surrounded by whitespace/newlines.
            string rankText = GetInnerTextOrNull(item, ".//span[@class='zg_rankNumber']");
            int rank;
            if (rankText != null
                && int.TryParse(rankText.Replace(".", "").Replace("\n", "").Trim(), out rank))
            {
                dr["排名"] = rank;
            }

            // Product name.
            string name = GetInnerTextOrNull(
                item,
                ".//div[@class='p13n-sc-truncate p13n-sc-truncated-hyphen p13n-sc-line-clamp-2']");
            if (name != null)
            {
                dr["商品名稱"] = name.Replace("\n", "").Trim();
            }

            // Product price.
            string price = GetInnerTextOrNull(item, ".//span[@class='p13n-sc-price']");
            if (price != null)
            {
                dr["售價"] = price.Replace("\n", "");
            }

            // Link to the detail page; position() is 1-based in XPath.
            HtmlNode link = item.SelectSingleNode(".//a[@class='a-link-normal' and position()=1]");
            string href = link == null ? string.Empty : link.GetAttributeValue("href", string.Empty);
            if (!string.IsNullOrEmpty(href))
            {
                FillDetailColumns(dr, "https://www.amazon.com" + href);
            }

            dt.Rows.Add(dr);
        }
    }
    catch (Exception ex)
    {
        MessageBox.Show("爬蟲抓取失敗,失敗信息:" + ex.Message);
    }
}

/// <summary>
/// Downloads a product detail page and copies the first-available date and the sales-rank
/// text into <paramref name="dr"/> when the detail-bullets section exists.
/// </summary>
private static void FillDetailColumns(DataRow dr, string detailUrl)
{
    string htmlDetail = WebRequestPost(detailUrl);
    HtmlAgilityPack.HtmlDocument docDetail = new HtmlAgilityPack.HtmlDocument();
    docDetail.LoadHtml(htmlDetail);

    HtmlNode root = docDetail.DocumentNode;
    if (root.SelectSingleNode("//div[@id='detailBulletsWrapper_feature_div']") == null)
    {
        return;
    }

    // The queries below are document-wide, exactly as the original absolute XPaths were.
    // A fixed li position cannot be used because the number of li items differs per product,
    // so the date is located through its label text and the sibling span that follows it.
    string dateFirst = GetInnerTextOrNull(
        root,
        "//span[contains(text(), 'Date first available at Amazon.com')]/following-sibling::span[1]");
    if (dateFirst != null)
    {
        dr["首次上架日期"] = dateFirst;
    }

    // Main-category rank: the first text node following the <b> label inside li#SalesRank.
    string categoryRank = GetInnerTextOrNull(root, "//li[@id='SalesRank']/b/following::text()[1]");
    // Sub-category ranks: the ul.zg_hrsr list inside li#SalesRank.
    string detailRank = GetInnerTextOrNull(root, "//li[@id='SalesRank']/ul[@class='zg_hrsr']");
    if (categoryRank == null && detailRank == null)
    {
        return;
    }

    // InnerText keeps HTML entities encoded, so the two that appear in the rank list are
    // decoded by hand (the published listing had these literals mangled into no-op replaces).
    string mainPart = categoryRank == null ? string.Empty : categoryRank.Replace("(", "");
    string subPart = detailRank == null
        ? string.Empty
        : detailRank.Replace("&nbsp;", " ").Replace("&gt;", ">");
    dr["排名信息"] = mainPart + subPart;
}

/// <summary>
/// Returns the InnerText of the first node matching <paramref name="xpath"/> under
/// <paramref name="scope"/>, or null when no node matches.
/// </summary>
private static string GetInnerTextOrNull(HtmlNode scope, string xpath)
{
    HtmlNode match = scope.SelectSingleNode(xpath);
    return match == null ? null : match.InnerText;
}
最後將html裡面的內容解析出來以後,添加到DataTable,然後再導入到Excel