應該先說明:本來想對網頁加載的程序段進行規範,但再次編寫時發現,還是不能很好地掌握網頁加載的具體規則,導致獲取頁面的代碼仍然很繁雜。其他部分改得差不多了。另外,當微博中的字符含有 {} 等時,會提示字符串格式錯誤,這個也需要改進,但還沒改。程序還需要一個掛起線程的功能,保留現場,讓程序可以繼續爬取,而不是從頭再爬。
各種類
各種類
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Windows.Forms;
using System.IO;
using HtmlAgilityPack;

namespace WeiBoGrab
{
    class WeiBoGrabClass
    {
    }

    /// <summary>
    /// Waits for pages shown in a <see cref="WebBrowser"/> to finish loading.
    /// Every method busy-waits by calling <see cref="Application.DoEvents"/> until the
    /// DOM elements it looks for are present, then returns a status message.
    /// </summary>
    public class GetPage
    {
        // Id of the element that hosts the feed on personal/media pages.
        private const string FeedContainerId = "pl_content_hisFeed";

        // Id of the element that hosts the list of followed accounts.
        private const string FollowListId = "pl_relation_myfollow";

        /// <summary>Returns true when <paramref name="text"/> is one of the two "loading" placeholder texts.</summary>
        private static bool IsLoadingText(string text)
        {
            return text == "正在加載中,請稍候..." || text == "正在加載,請稍候...";
        }

        /// <summary>Looks up the feed container again on every call, because the page changes while waiting.</summary>
        private static HtmlElement FeedContainer(WebBrowser browser)
        {
            return browser.Document.GetElementById(FeedContainerId);
        }

        /// <summary>Returns the child of the feed container at <paramref name="feedPosition"/>.</summary>
        private static HtmlElement FeedList(WebBrowser browser, int feedPosition)
        {
            return FeedContainer(browser).Children[feedPosition];
        }

        /// <summary>Waits for the initial (login) page.</summary>
        /// <param name="browser">Browser that is navigating to the login page.</param>
        /// <returns>A status message.</returns>
        public string GetLoginPage(WebBrowser browser)
        {
            while (browser.ReadyState != WebBrowserReadyState.Complete)
            {
                Application.DoEvents();
            }

            while (browser.Document.GetElementById("pl_login_form").InnerHtml == null)
            {
                Application.DoEvents();
            }

            return "加載登陸頁面完成。";
        }

        /// <summary>Waits for the logged-in user's home page.</summary>
        /// <param name="browser">Browser that is navigating to the home page.</param>
        /// <returns>A status message.</returns>
        public string GetMainPage(WebBrowser browser)
        {
            while (browser.DocumentTitle != "我的首頁 新浪微博-隨時隨地分享身邊的新鮮事兒")
            {
                Application.DoEvents();
            }

            // Make sure the required content is loaded. Keep waiting while the info panel
            // is missing OR still has fewer than two children. The previous condition
            // (`!= null && Count < 2`) stopped waiting as soon as the panel was missing,
            // and Follow.GetFollows then dereferenced the missing panel.
            HtmlElement myInfo = browser.Document.GetElementById("pl_rightmod_myinfo");
            while (myInfo == null || myInfo.Children.Count < 2)
            {
                Application.DoEvents();
                myInfo = browser.Document.GetElementById("pl_rightmod_myinfo");
            }

            return "加載個人主頁完成。";
        }

        /// <summary>Waits for the first page of the user's followed accounts.</summary>
        /// <param name="browser">Browser that is navigating to the follow list.</param>
        /// <returns>A status message.</returns>
        public string GetFollowsPage(WebBrowser browser)
        {
            while (browser.DocumentTitle != "我關注的人 新浪微博-隨時隨地分享身邊的新鮮事兒")
            {
                Application.DoEvents();
            }

            while (browser.Document.GetElementById(FollowListId) == null)
            {
                Application.DoEvents();
            }

            while (browser.Document.GetElementById(FollowListId).Children.Count < 3)
            {
                Application.DoEvents();
            }

            return "關注對象頁面第一頁加載完成。";
        }

        /// <summary>Waits for the next page of the user's followed accounts.</summary>
        /// <param name="browser">Browser in which "next page" has just been clicked.</param>
        /// <returns>A status message.</returns>
        public string GetFollowsNextPage(WebBrowser browser)
        {
            // The caller blanks part of the old list (Children[2].Children[2]) before calling.
            // Original author's notes: the followed list is children[2].children[1]; after the
            // new page loads Children[2].Children.Count reads 3, yet the `< 4` check below
            // works and the author did not know why. Tags such as <!-- --> are sometimes
            // counted or extracted as elements, so this needs case-by-case analysis.
            while (browser.Document.GetElementById(FollowListId).Children.Count < 3
                || browser.Document.GetElementById(FollowListId).Children[2].Children.Count < 4)
            {
                Application.DoEvents();
            }

            // Once the condition holds, what is loaded is the newly generated content.
            return "關注對象下一頁加載完成。";
        }

        /// <summary>
        /// Waits for a followed account's home page, scrolling the "loading" placeholder
        /// into view repeatedly so that lazily loaded posts are fetched.
        /// </summary>
        /// <param name="browser">Browser that is navigating to the account's page.</param>
        /// <returns>A status message (different text for magazine/news pages).</returns>
        public string GetFollowMainPage(WebBrowser browser)
        {
            while (browser.ReadyState != WebBrowserReadyState.Complete)
            {
                Application.DoEvents();
            }

            // Magazine / news style accounts.
            if (browser.Document.GetElementById("epfeedlist") != null)
            {
                while (browser.Document.GetElementById("feed_list") == null)
                {
                    Application.DoEvents();
                }

                return "關注對象主頁第一頁加載完成。";
            }

            // Personal / media style accounts.
            if (FeedContainer(browser) == null)
            {
                while (browser.Document.GetElementById("profileFeed").InnerHtml == null)
                {
                    Application.DoEvents();
                }
            }

            while (FeedContainer(browser).InnerHtml == null)
            {
                Application.DoEvents();
            }

            // Locate the feed list among the container's children.
            HtmlElementCollection containerChildren = FeedContainer(browser).Children;
            int feedPosition = 0;

            // Some pages need this step.
            while (IsLoadingText(FeedList(browser, feedPosition).InnerText))
            {
                Application.DoEvents();
            }

            // The feed container is not fully loaded yet.
            while (FeedContainer(browser).Children.Count < 2)
            {
                Application.DoEvents();
            }

            foreach (HtmlElement child in containerChildren)
            {
                if (child.GetAttribute("node-type") != null && child.GetAttribute("node-type") == "feed_list")
                {
                    break;
                }

                feedPosition++;
            }

            // This wait occurs when a page other than the first is loading.
            while (IsLoadingText(FeedList(browser, feedPosition).Children[0].InnerText))
            {
                Application.DoEvents();
            }

            // Index of the last child of the feed list; the loading placeholder is near the end.
            int lastIndex = FeedList(browser, feedPosition).Children.Count - 1;

            // True while a loading placeholder is still present.
            bool loading = true;

            // Search backwards (at most 9 steps) for the loading placeholder.
            HtmlElement load = FeedList(browser, feedPosition).Children[lastIndex];
            int i;
            for (i = 1; (i < 10) && (lastIndex - i >= 0); i++)
            {
                if (IsLoadingText(load.InnerText))
                {
                    break;
                }

                load = FeedList(browser, feedPosition).Children[lastIndex - i];
            }

            while (loading)
            {
                loading = false;
                load.ScrollIntoView(false);
                while (IsLoadingText(load.InnerText))
                {
                    load.ScrollIntoView(false);
                    Application.DoEvents();

                    // NOTE(review): `i` is the offset left over from the first search above and
                    // is not updated by the later search (which uses `j`), so this may re-read
                    // a different element than the one found. Left unchanged; confirm intent.
                    load = FeedList(browser, feedPosition).Children[lastIndex - i];
                }

                // Wait for more posts. The iteration cap prevents waiting forever; the
                // original author notes that too small a value can make some pages fail to load.
                int Limit = 100;
                int L = 0;
                while ((FeedList(browser, feedPosition).Children.Count < lastIndex + 2) && (L < Limit))
                {
                    L++;
                    Application.DoEvents();
                }

                // Refresh the last index and search for the loading placeholder again.
                lastIndex = FeedList(browser, feedPosition).Children.Count - 1;
                load = FeedList(browser, feedPosition).Children[lastIndex];

                // Assumes there are no more than 10 irrelevant trailing tags.
                for (int j = 1; (j < 10) && (lastIndex - j >= 0); j++)
                {
                    if (IsLoadingText(load.InnerText))
                    {
                        break;
                    }

                    load = FeedList(browser, feedPosition).Children[lastIndex - j];
                }

                if (load != null && IsLoadingText(load.InnerText))
                {
                    loading = true;
                    load.ScrollIntoView(false);
                }
            }

            return "加載關注對象主頁第一頁面完成。";
        }

        /// <summary>Waits for a subsequent page of a followed account's home page.</summary>
        /// <param name="browser">Browser in which "next page" has just been clicked.</param>
        /// <returns>A status message.</returns>
        public string GetFollowMainNextPage(WebBrowser browser)
        {
            Application.DoEvents();
            while (browser.ReadyState != WebBrowserReadyState.Complete)
            {
                Application.DoEvents();
            }

            GetFollowMainPage(browser);

            // Original comment: "for magazine / news style accounts".
            if (browser.Document.GetElementById("epfeedlist") == null)
            {
                Application.DoEvents();
            }

            return "加載關注對象后續頁面完成。";
        }
    }

    /// <summary>Fills in and submits the login form.</summary>
    public class LoginSubmit
    {
        private readonly string username;
        private readonly string password;

        /// <summary>Stores the credentials to be typed into the login form.</summary>
        public LoginSubmit(string username, string password)
        {
            this.username = username;
            this.password = password;
        }

        /// <summary>
        /// Enters the stored user name and password into the first two INPUT elements of
        /// the "pl_login_form" element and clicks the span whose node-type is "submitStates".
        /// </summary>
        /// <param name="browser">Browser currently showing the login page.</param>
        public void LoginClick(WebBrowser browser)
        {
            // Login module of the login page.
            HtmlElement loginForm = browser.Document.GetElementById("pl_login_form");

            // Click the user-name input first to give it focus (the original author's
            // stated purpose is to clear the box), then set its value.
            HtmlElement userInput = loginForm.GetElementsByTagName("INPUT")[0];
            userInput.InvokeMember("click");
            userInput.SetAttribute("value", username);

            // Same for the password input.
            HtmlElement passwordInput = loginForm.GetElementsByTagName("INPUT")[1];
            passwordInput.InvokeMember("click");
            passwordInput.SetAttribute("value", password);

            // Find the login button and click it.
            HtmlElementCollection spans = loginForm.GetElementsByTagName("span");
            foreach (HtmlElement span in spans)
            {
                if (span.GetAttribute("node-type") != null && span.GetAttribute("node-type") == "submitStates")
                {
                    span.InvokeMember("click");
                    break;
                }
            }
        }
    }

    /// <summary>Operations on the logged-in user's followed accounts.</summary>
    public class Follow
    {
        /// <summary>
        /// Clicks the "follow" counter in the user's info panel and waits for the follow list.
        /// Does nothing when the counter reads "0" or the first strong element is not the
        /// follow counter.
        /// </summary>
        /// <param name="browser">Browser currently showing the user's home page.</param>
        public void GetFollows(WebBrowser browser)
        {
            // User info module.
            HtmlElement myInfo = browser.Document.GetElementById("pl_rightmod_myinfo");

            // Follow-count sub-module.
            HtmlElement followCounter = myInfo.GetElementsByTagName("strong")[0];
            if (followCounter.GetAttribute("node-type") != "follow")
            {
                return;
            }

            // The user follows nobody.
            if (followCounter.InnerText == "0")
            {
                return;
            }

            followCounter.InvokeMember("click");
            GetPage pageLoader = new GetPage();
            pageLoader.GetFollowsPage(browser);
        }

        /// <summary>
        /// Walks through every page of the follow list and writes one line per followed
        /// account to <paramref name="sw"/> in the form <c>No.{n}|{name}|{url}</c>.
        /// </summary>
        /// <param name="browser">Browser currently showing the follow list.</param>
        /// <param name="sw">Destination writer. It is closed before this method returns, also when an exception occurs.</param>
        public void GetFollowsUrl(WebBrowser browser, StreamWriter sw)
        {
            try
            {
                // Whether there is another page to process.
                bool hasNext = true;
                int urlCount = 0;
                while (hasNext)
                {
                    // Assume there is no next page until a pager link is found.
                    hasNext = false;

                    HtmlElement followPanel = browser.Document.GetElementById("pl_relation_myfollow");
                    if (followPanel == null)
                    {
                        // The follow list is not on the page (GetFollows returns without
                        // navigating when the user follows nobody): nothing to write.
                        break;
                    }

                    HtmlElementCollection divs = followPanel.GetElementsByTagName("div");
                    foreach (HtmlElement div in divs)
                    {
                        if (div.GetAttribute("action-type") == "ignore_list")
                        {
                            HtmlNode link = HtmlNode.CreateNode(div.InnerHtml);
                            string url = link.Attributes["href"].Value;
                            string followName = link.FirstChild.Attributes["alt"].Value;
                            sw.WriteLine("No.{0}|{1}|{2}", ++urlCount, followName, url);
                        }
                    }

                    // Look for a "next page" link.
                    HtmlElementCollection spans = followPanel.GetElementsByTagName("span");
                    foreach (HtmlElement span in spans)
                    {
                        if (span.InnerText == "下一頁")
                        {
                            hasNext = true;
                            span.InvokeMember("click");

                            // Blank part of the old list so GetFollowsNextPage can detect the new content.
                            browser.Document.GetElementById("pl_relation_myfollow").Children[2].Children[2].OuterHtml = null;

                            // Load the next page of followed accounts.
                            GetPage pageLoader = new GetPage();
                            pageLoader.GetFollowsNextPage(browser);
                            break;
                        }
                    }
                }
            }
            finally
            {
                // Callers rely on the writer being closed here so the file can be reopened.
                sw.Close();
            }
        }
    }

    /// <summary>Scrapes every post of one followed account into a text file.</summary>
    public class WeiBo
    {
        private string FollowName;
        private string FollowUrl;

        /// <summary>Remembers the account's display name (used as file name) and page URL.</summary>
        public WeiBo(string FollowName, string FollowUrl)
        {
            this.FollowName = FollowName;
            this.FollowUrl = FollowUrl;
        }

        /// <summary>
        /// Navigates to the account's page and writes each post as <c>第{n}條|{text}</c> to
        /// <c>D:\weibo\{FollowName}.txt</c>, following "next page" links until none is left.
        /// Pages of an unrecognised kind produce an empty file.
        /// </summary>
        /// <param name="browser">Browser used for navigation.</param>
        public void GetWeiBo(WebBrowser browser)
        {
            // `using` releases the file on every exit path, including the early return
            // for unrecognised pages and any exception thrown while scraping.
            using (StreamWriter sw = File.CreateText("D:\\weibo\\" + FollowName + ".txt"))
            {
                bool hasNext = true;
                int weiBoCount = 0;
                browser.Navigate(new Uri(FollowUrl));
                GetPage pageLoader = new GetPage();
                pageLoader.GetFollowMainPage(browser);

                // "N": page kind not recognised yet.
                string kind = "N";
                HtmlElement epFeedList = browser.Document.GetElementById("epfeedlist");
                HtmlElement hisFeed = browser.Document.GetElementById("pl_content_hisFeed");
                if (hisFeed != null)
                {
                    // Original author's notes: on media pages Children[1].Children[0] is a "dl";
                    // on personal pages Children[1] is an HTML comment.
                    if (hisFeed.Children[1].Children.Count != 0)
                    {
                        kind = "M"; // media (small) account
                    }
                    else
                    {
                        kind = "P"; // personal account
                    }
                }

                if (epFeedList != null)
                {
                    kind = "J"; // magazine, news, ...
                }

                while (hasNext)
                {
                    switch (kind)
                    {
                        case "P":
                            weiBoCount = WriteFeedItems(browser, sw, "div", weiBoCount);
                            hasNext = GoToNextFeedPage(browser, pageLoader);
                            break;

                        case "J":
                            hasNext = false;

                            // Scrape each post.
                            int itemCount = browser.Document.GetElementById("feed_list").Children.Count;
                            for (int i = 0; i < itemCount; i++)
                            {
                                string text = browser.Document.GetElementById("feed_list").Children[i].GetElementsByTagName("p")[0].InnerText;
                                WritePost(sw, ++weiBoCount, text);
                            }

                            // Look for a "next page" link (the last em after the list).
                            HtmlElementCollection ems = browser.Document.GetElementById("feed_list").NextSibling.GetElementsByTagName("em");
                            int end = ems.Count;
                            if (end > 0 && ems[end - 1].InnerText == "下一頁")
                            {
                                ems[end - 1].InvokeMember("click");
                                browser.Document.GetElementById("feed_list").OuterHtml = null;
                                pageLoader.GetFollowMainNextPage(browser);
                                hasNext = true;
                            }

                            break;

                        case "M":
                            weiBoCount = WriteFeedItems(browser, sw, "p", weiBoCount);
                            hasNext = GoToNextFeedPage(browser, pageLoader);
                            break;

                        default:
                            return; // page kind not handled yet
                    }
                }
            }
        }

        /// <summary>
        /// Writes one post line. The text is passed as a format ARGUMENT, never spliced into
        /// the format string, so posts containing '{' or '}' no longer raise FormatException.
        /// </summary>
        private static void WritePost(StreamWriter sw, int number, string text)
        {
            sw.WriteLine("第{0}條|{1}", number, text);
        }

        /// <summary>
        /// Writes every <paramref name="tagName"/> element inside the feed container whose
        /// node-type is "feed_list_content" and returns the updated running post count.
        /// </summary>
        private static int WriteFeedItems(WebBrowser browser, StreamWriter sw, string tagName, int count)
        {
            HtmlElementCollection candidates = browser.Document.GetElementById("pl_content_hisFeed").GetElementsByTagName(tagName);
            foreach (HtmlElement candidate in candidates)
            {
                if (candidate.GetAttribute("node-type") == "feed_list_content")
                {
                    WritePost(sw, ++count, candidate.InnerText);
                }
            }

            return count;
        }

        /// <summary>
        /// Clicks the first span reading "下一頁" inside the feed container and waits for the
        /// next page. Returns false when there is no such span.
        /// </summary>
        private static bool GoToNextFeedPage(WebBrowser browser, GetPage pageLoader)
        {
            HtmlElementCollection spans = browser.Document.GetElementById("pl_content_hisFeed").GetElementsByTagName("span");
            foreach (HtmlElement span in spans)
            {
                if (span.InnerText == "下一頁")
                {
                    span.InvokeMember("click");
                    pageLoader.GetFollowMainNextPage(browser);
                    return true;
                }
            }

            return false;
        }
    }
}
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Windows.Forms;
using System.IO;

namespace WeiBoGrab
{
    /// <summary>Main window: logs in, lists the followed accounts, then scrapes each one.</summary>
    public partial class Form1 : Form
    {
        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>
        /// Runs the whole scrape: opens weibo.com, logs in with the credentials from the
        /// text boxes, writes the followed accounts to FollowUrl.txt, then reads that file
        /// back and scrapes every listed account.
        /// </summary>
        private void button1_Click(object sender, EventArgs e)
        {
            string username = textBox1.Text;
            string password = textBox2.Text;
            string url = "http://weibo.com/";
            GetPage getpage = new GetPage();
            WebBrowser browser = webBrowser1;

            // GetFollowsUrl closes the writer itself; the using block additionally releases
            // the file when an earlier step throws (disposing a closed writer is harmless).
            using (StreamWriter sw = File.CreateText("FollowUrl.txt"))
            {
                browser.Navigate(new Uri(url));

                // Load the login page.
                textBox3.Text += getpage.GetLoginPage(browser);

                // Log in.
                LoginSubmit loginsubmit = new LoginSubmit(username, password);
                loginsubmit.LoginClick(browser);

                // Load the user's home page.
                textBox3.Text += getpage.GetMainPage(browser);

                // Collect the followed accounts.
                Follow follow = new Follow();
                follow.GetFollows(browser);
                follow.GetFollowsUrl(browser, sw);
            }

            using (FileStream fs = new FileStream("FollowUrl.txt", FileMode.Open))
            using (StreamReader sr = new StreamReader(fs))
            {
                string s;
                while ((s = sr.ReadLine()) != null)
                {
                    // Expected line shape: No.{n}|{name}|{url}
                    string[] arry = s.Split('|');
                    if (arry.Length < 3)
                    {
                        continue; // blank or malformed line: nothing to scrape
                    }

                    string name = arry[1];
                    string user_url = arry[2];
                    WeiBo feed = new WeiBo(name, user_url);
                    feed.GetWeiBo(browser);
                }
            }
        }
    }
}