最近研究 C# 的爬蟲寫法,搞了半天,才從網上眾多的寫法中整理出一個簡單的 demo(本人菜鳥,大神勿噴)。一是為了自己記錄一下,以備日後用到;二是為了供有需要的朋友參考。
廢話不多說,上代碼
using HtmlAgilityPack;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Threading.Tasks;

namespace Crawler
{
    /// <summary>
    /// Minimal crawler demo: downloads the Baidu News front page, parses it with
    /// HtmlAgilityPack, and prints the title and link of every headline found in
    /// the <c>pane-news</c> section.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Fetches the page, prints the headlines to the console, then waits for
        /// the user to press Enter before exiting.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            // Optional: route the request through a proxy.
            // WebProxy proxyObject = new WebProxy(IP, PORT);

            // Build the request for the target address.
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create("http://news.baidu.com/");
            // request.Proxy = proxyObject;
            request.Timeout = 10000; // milliseconds

            HtmlDocument doc = new HtmlDocument();

            // The using blocks release the connection and the reader even when
            // the download or the parse throws, and before we block on ReadLine.
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            using (StreamReader reader = new StreamReader(response.GetResponseStream(), Encoding.UTF8))
            {
                doc.Load(reader);
            }

            PrintHeadlines(doc);

            // Keep the console window open until the user presses Enter.
            Console.ReadLine();
        }

        /// <summary>
        /// Writes one console line per headline link found under
        /// <c>div#pane-news / ul / li / a</c>. Missing nodes are skipped rather
        /// than dereferenced, so a change in page layout yields no output
        /// instead of a <see cref="NullReferenceException"/>.
        /// </summary>
        /// <param name="doc">The parsed HTML document to scan.</param>
        private static void PrintHeadlines(HtmlDocument doc)
        {
            HtmlNode newsPane = doc.DocumentNode.SelectSingleNode("//div[@id='pane-news']");
            if (newsPane == null)
            {
                return;
            }

            // SelectNodes returns null (not an empty collection) when nothing matches.
            HtmlNodeCollection ulNodes = newsPane.SelectNodes("ul");
            if (ulNodes == null)
            {
                return;
            }

            foreach (HtmlNode ulNode in ulNodes)
            {
                HtmlNodeCollection liNodes = ulNode.SelectNodes("li");
                if (liNodes == null)
                {
                    continue;
                }

                foreach (HtmlNode liNode in liNodes)
                {
                    // Look the anchor up once; list items without a direct <a> child are skipped.
                    HtmlNode anchor = liNode.SelectSingleNode("a");
                    if (anchor == null)
                    {
                        continue;
                    }

                    string title = anchor.InnerHtml.Trim();
                    string href = anchor.GetAttributeValue("href", "").Trim();
                    Console.WriteLine("新聞標題:" + title + ",鏈接:" + href);
                }
            }
        }
    }
}
其中解析html的寫法用到了XPath的語法,大家可以自行百度下,比較簡單。
