C# 新浪微博滾動抓取 WeiboGrab


需要先說明:本來想對網頁加載的程序段進行規範,但再次編寫時發現,還是不能很好地掌握網頁加載的具體規則,導致獲取頁面的代碼仍然很繁雜。其他部分改得差不多了。另外,當微博中的字符含有 { } 等時,會提示字符串格式錯誤,這一點也需要改進,但目前還沒改。程序還需要一個掛起線程、保留現場的功能,讓程序可以從中斷處繼續爬取,而不是從頭再爬。
各種類
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Windows.Forms;
using System.IO;
using HtmlAgilityPack;

namespace WeiBoGrab
{
    // Empty placeholder class: it declares no members, and none of the code
    // shown in this file refers to it.
    class WeiBoGrabClass
    {
    }
    
    public class GetPage
    {
        //加載初始頁面
        /// <summary>
        /// Waits, while pumping the message loop, until the browser reports the
        /// document as complete and the element with id "pl_login_form" has markup.
        /// </summary>
        /// <param name="browser">The browser control that was navigated to the login page.</param>
        /// <returns>A fixed status message saying the login page has loaded.</returns>
        /// <remarks>There is no timeout: the call spins until both conditions hold.</remarks>
        public string GetLoginPage(WebBrowser browser)
        {
            while (browser.ReadyState != WebBrowserReadyState.Complete)
            {
                Application.DoEvents();
            }

            // GetElementById returns null when no element has that id (and Document
            // is null when no page is loaded), so test both before reading InnerHtml
            // instead of dereferencing the lookup result directly.
            while (true)
            {
                HtmlElement loginForm = browser.Document == null
                    ? null
                    : browser.Document.GetElementById("pl_login_form");
                if (loginForm != null && loginForm.InnerHtml != null)
                {
                    break;
                }
                Application.DoEvents();
            }
            return "加載登陸頁面完成。";
        }
       //加載用戶主頁
        /// <summary>
        /// Pumps the message loop until the browser shows the signed-in home page
        /// (matched by its exact title) and the "pl_rightmod_myinfo" panel, if it
        /// exists, has at least two children.
        /// </summary>
        /// <param name="browser">The browser control that is loading the home page.</param>
        /// <returns>A fixed status message saying the home page has loaded.</returns>
        public string GetMainPage(WebBrowser browser)
        {
            const string homeTitle = "我的首頁 新浪微博-隨時隨地分享身邊的新鮮事兒";
            const string infoPanelId = "pl_rightmod_myinfo";

            // Phase 1: wait for the title of the home page.
            while (!homeTitle.Equals(browser.DocumentTitle))
            {
                Application.DoEvents();
            }

            // Phase 2: make sure the needed content has been filled in.
            // NOTE(review): when the panel does not exist at all, this stops waiting
            // immediately rather than waiting for it to appear - confirm that is intended.
            for (;;)
            {
                HtmlElement infoPanel = browser.Document.GetElementById(infoPanelId);
                bool stillFilling = infoPanel != null && infoPanel.Children.Count < 2;
                if (!stillFilling)
                {
                    break;
                }
                Application.DoEvents();
            }

            return "加載個人主頁完成。";
        }
       //加載用戶關注對象的第一頁
        /// <summary>
        /// Pumps the message loop until the first page of the user's follow list is
        /// shown: the title matches, the "pl_relation_myfollow" panel exists, and
        /// that panel has at least three children.
        /// </summary>
        /// <param name="browser">The browser control that is loading the follow list.</param>
        /// <returns>A fixed status message saying the first follow-list page has loaded.</returns>
        public string GetFollowsPage(WebBrowser browser)
        {
            const string followsTitle = "我關注的人 新浪微博-隨時隨地分享身邊的新鮮事兒";
            const string panelId = "pl_relation_myfollow";

            // Step 1: the page title must be the follow-list title.
            while (true)
            {
                if (browser.DocumentTitle == followsTitle)
                {
                    break;
                }
                Application.DoEvents();
            }

            // Step 2: the follow panel must exist.
            HtmlElement panel = browser.Document.GetElementById(panelId);
            while (panel == null)
            {
                Application.DoEvents();
                panel = browser.Document.GetElementById(panelId);
            }

            // Step 3: the panel must have more than two children. The element is
            // looked up again on every pass, exactly as the wait requires.
            while (browser.Document.GetElementById(panelId).Children.Count <= 2)
            {
                Application.DoEvents();
            }

            return "關注對象頁面第一頁加載完成。";
        }
        //加載用戶關注對象的下一頁
        /// <summary>
        /// Pumps the message loop until the next page of the follow list is present:
        /// the "pl_relation_myfollow" panel exists, has at least three children, and
        /// its third child has at least four children.
        /// </summary>
        /// <param name="browser">The browser control in which "next page" was just clicked.</param>
        /// <returns>A fixed status message saying the next follow-list page has loaded.</returns>
        /// <remarks>There is no timeout: the call spins until the condition holds.</remarks>
        public string GetFollowsNextPage(WebBrowser browser)
        {
            // Notes carried over from the original author:
            // - The follow list of the old page is cleared first
            //   (the follow list is children[2].children[1]).
            // - On the freshly loaded page, Children[2].Children.Count reports 3, yet
            //   waiting for "Count < 4" to become false works; the author did not
            //   understand why.
            // - Tags such as <!-- --> are sometimes counted or extracted as elements,
            //   so the real page has to be inspected.

            while (true)
            {
                // The panel is looked up on every pass and checked for null first,
                // matching GetFollowsPage: GetElementById returns null when the
                // element is (temporarily) absent.
                HtmlElement followPanel = browser.Document.GetElementById("pl_relation_myfollow");
                if (followPanel != null
                    && followPanel.Children.Count >= 3
                    && followPanel.Children[2].Children.Count >= 4)
                {
                    break;
                }
                Application.DoEvents();
            }

            // Once the condition above holds, what is read next is the newly generated content.
            return "關注對象下一頁加載完成。";
        }
        //加載關注對象的主頁的第一頁
        /// <summary>
        /// Waits until the home page of a followed account has loaded, and for
        /// pages that use the "pl_content_hisFeed" container keeps scrolling the
        /// "loading" placeholder into view until no such placeholder is found.
        /// </summary>
        /// <param name="browser">The browser control that was navigated to the account page.</param>
        /// <returns>
        /// A fixed status message. Pages containing an element with id "epfeedlist"
        /// return early with one message; all other pages return the message at the
        /// end of the method.
        /// </returns>
        /// <remarks>
        /// Every wait is a busy loop around Application.DoEvents. Only the wait for
        /// additional feed children is bounded (by Limit iterations).
        /// </remarks>
        public string GetFollowMainPage(WebBrowser browser)
        {
            while (browser.ReadyState != WebBrowserReadyState.Complete)
            {
                Application.DoEvents();
            }

            // Magazine / news style accounts: only wait for "feed_list" to exist.
            if (browser.Document.GetElementById("epfeedlist") != null)
            {
                while (browser.Document.GetElementById("feed_list") == null)
                {
                    Application.DoEvents();
                }
                return "關注對象主頁第一頁加載完成。";
            }
            // Personal / media style accounts.
            // NOTE(review): GetElementById returns null for a missing element, so the
            // ".InnerHtml" reads below throw if "profileFeed" or "pl_content_hisFeed"
            // is still absent at that point - confirm this cannot happen in practice.
            if (browser.Document.GetElementById("pl_content_hisFeed") == null)
            {
                while (browser.Document.GetElementById("profileFeed").InnerHtml == null)
                {
                    Application.DoEvents();
                }
            }
            while (browser.Document.GetElementById("pl_content_hisFeed").InnerHtml == null)
            {
                Application.DoEvents();
            }
            // Locate the feed.
            // NOTE(review): ps is captured here, before the waits below finish; whether
            // it reflects children added afterwards is not visible here - verify.
            HtmlElementCollection ps = browser.Document.GetElementById("pl_content_hisFeed").Children;
            int feed_postion = 0;
            // Some account pages need this step: wait while the first child still
            // shows one of the two "loading" texts.
            while (browser.Document.GetElementById("pl_content_hisFeed").Children[feed_postion].InnerText == "正在加載,請稍候..." ||
                browser.Document.GetElementById("pl_content_hisFeed").Children[feed_postion].InnerText == "正在加載中,請稍候...")
            {
                Application.DoEvents();
            }
            // pl_content_hisFeed is not fully loaded until it has at least two children.
            while (browser.Document.GetElementById("pl_content_hisFeed").Children.Count < 2)
            {
                Application.DoEvents();
            }
            // feed_postion becomes the index of the first child whose node-type is
            // "feed_list" (or ps.Count when none matches).
            foreach (HtmlElement p in ps)
            {
                if (p.GetAttribute("node-type") != null && p.GetAttribute("node-type") == "feed_list")
                {
                    break;
                }
                else
                    feed_postion++;
            }
            // This wait applies when loading a page other than the first one.
            while (browser.Document.GetElementById("pl_content_hisFeed").Children[feed_postion].Children[0].InnerText == "正在加載中,請稍候..."
                   || browser.Document.GetElementById("pl_content_hisFeed").Children[feed_postion].Children[0].InnerText == "正在加載,請稍候...")
            {
                Application.DoEvents();
            }
            // Index of the last child of the feed; the author uses it both as the
            // post count and as the position where the loading module is expected.
            int hisFeed_count = browser.Document.GetElementById("pl_content_hisFeed").Children[feed_postion].Children.Count - 1;
            // True while a loading placeholder is still present.
            bool loading = true;
            // Find the loading module: start at the last child and step backwards
            // (at most 9 steps) until a child shows one of the "loading" texts.
            HtmlElement load = browser.Document.GetElementById("pl_content_hisFeed").Children[feed_postion].Children[hisFeed_count];
            int i;
            for (i = 1; (i < 10) && (hisFeed_count - i >= 0); i++)
            {
                if (load.InnerText == "正在加載中,請稍候..." || load.InnerText == "正在加載,請稍候...")
                    break;
                load = browser.Document.GetElementById("pl_content_hisFeed").Children[feed_postion].Children[hisFeed_count - i];
            }
            while (loading)
            {
                loading = false;
                load.ScrollIntoView(false);
                while (load.InnerText == "正在加載中,請稍候..." || load.InnerText == "正在加載,請稍候...")
                {
                    load.ScrollIntoView(false);
                    Application.DoEvents();
                    // NOTE(review): after the search loop above, load came from index
                    // hisFeed_count - (i - 1), but this re-reads index hisFeed_count - i,
                    // and i is not refreshed on later passes of the outer loop (the
                    // second search uses j). Looks like an off-by-one - confirm.
                    load = browser.Document.GetElementById("pl_content_hisFeed").Children[feed_postion].Children[hisFeed_count - i];
                }
                // Wait for more posts to be appended.
                // The iteration limit is open to debate: if it is too small, some
                // posts may fail to load.
                int Limit = 100;
                int L = 0;
                while ((browser.Document.GetElementById("pl_content_hisFeed").Children[feed_postion].Children.Count < hisFeed_count + 2)&&
                    (L < Limit ))
                {
                    L++;// guards against waiting forever for the load
                    Application.DoEvents();
                }
                // Refresh the expected position of the loading module.
                hisFeed_count = browser.Document.GetElementById("pl_content_hisFeed").Children[feed_postion].Children.Count - 1;
                // Refresh the loading module itself, searching backwards as before.
                load = browser.Document.GetElementById("pl_content_hisFeed").Children[feed_postion].Children[hisFeed_count];
                for (int j = 1; (j < 10) && (hisFeed_count - j >= 0); j++)// assumes there are no more than 10 irrelevant tags
                {
                    if (load.InnerText == "正在加載中,請稍候..." || load.InnerText == "正在加載,請稍候...")
                        break;
                    load = browser.Document.GetElementById("pl_content_hisFeed").Children[feed_postion].Children[hisFeed_count - j];
                }
                // Another placeholder was found: scroll to it and go round again.
                if (load != null && (load.InnerText == "正在加載中,請稍候..." || load.InnerText == "正在加載,請稍候..."))
                {
                    loading = true;
                    load.ScrollIntoView(false);
                }
            }
            return "加載關注對象主頁第一頁面完成。";

        }
        //加載關注對象的的主頁的下一頁
        /// <summary>
        /// Waits for a subsequent page of a followed account's home page by pumping
        /// the message loop until the document is complete and then running the
        /// same wait as <see cref="GetFollowMainPage"/>.
        /// </summary>
        /// <param name="browser">The browser control in which "next page" was just clicked.</param>
        /// <returns>A fixed status message saying the subsequent page has loaded.</returns>
        public string GetFollowMainNextPage(WebBrowser browser)
        {
            // Pump at least once, then keep pumping until the document is complete.
            do
            {
                Application.DoEvents();
            }
            while (browser.ReadyState != WebBrowserReadyState.Complete);

            // The status string of the first-page wait is intentionally discarded.
            GetFollowMainPage(browser);

            // NOTE(review): the original comment says this targets magazine/news
            // accounts, but the extra pump runs when "epfeedlist" is absent - confirm.
            bool hasMagazineFeed = browser.Document.GetElementById("epfeedlist") != null;
            if (!hasMagazineFeed)
            {
                Application.DoEvents();
            }

            return "加載關注對象后續頁面完成。";
        }
    }
    //用戶登陸類
    /// <summary>
    /// Holds the credentials supplied at construction and uses them to fill in
    /// and submit the login form of the page currently shown in a browser control.
    /// </summary>
    public class LoginSubmit
    {
        private string username;
        private string password;

        /// <summary>Creates a login helper for the given credentials.</summary>
        /// <param name="username">Value written into the first INPUT of the login form.</param>
        /// <param name="password">Value written into the second INPUT of the login form.</param>
        public LoginSubmit(string username, string password)
        {
            this.password = password;
            this.username = username;
        }

        /// <summary>
        /// Fills the first two INPUT elements of "pl_login_form" with the stored
        /// user name and password, then clicks the first span whose node-type
        /// attribute is "submitStates".
        /// </summary>
        /// <param name="browser">The browser control showing the login page.</param>
        public void LoginClick(WebBrowser browser)
        {
            // The login module of the login page.
            HtmlElement loginForm = browser.Document.GetElementById("pl_login_form");

            // INPUT #0 is the user name box, INPUT #1 the password box.
            FillInput(loginForm, 0, username);
            FillInput(loginForm, 1, password);

            // Find the login button and click it; only the first match is clicked.
            HtmlElementCollection spans = loginForm.GetElementsByTagName("span");
            for (int index = 0; index < spans.Count; index++)
            {
                HtmlElement candidate = spans[index];
                if (candidate.GetAttribute("node-type") == "submitStates")
                {
                    candidate.InvokeMember("click");
                    return;
                }
            }
        }

        // Clicks the index-th INPUT of the form (per the original author, to give
        // it focus so the box is cleared) and then sets its value.
        private static void FillInput(HtmlElement form, int index, string value)
        {
            HtmlElement input = form.GetElementsByTagName("INPUT")[index];
            input.InvokeMember("click");
            input.SetAttribute("value", value);
        }
    }
    //將關注對象設為一類
    /// <summary>
    /// Operations on the list of accounts the signed-in user follows: opening the
    /// list and exporting every entry's name and URL to a text writer.
    /// </summary>
    public class Follow
    {
        /// <summary>
        /// Opens the follow list by clicking the follow counter in the
        /// "pl_rightmod_myinfo" panel, then waits for its first page.
        /// </summary>
        /// <param name="browser">The browser control showing the user's home page.</param>
        /// <remarks>
        /// Does nothing when the panel or its first "strong" element is missing,
        /// when that element's node-type is not "follow", or when the counter
        /// text is "0" (the user follows nobody).
        /// </remarks>
        public void GetFollows(WebBrowser browser)
        {
            // The user's info module.
            HtmlElement infoPanel = browser.Document.GetElementById("pl_rightmod_myinfo");
            if (infoPanel == null)
            {
                return;
            }

            // The follow counter is expected to be the first "strong" element.
            HtmlElementCollection strongs = infoPanel.GetElementsByTagName("strong");
            if (strongs.Count == 0)
            {
                return;
            }

            HtmlElement followCounter = strongs[0];
            if (followCounter.GetAttribute("node-type") != "follow")
            {
                return;
            }

            // Nothing to open when the user follows nobody.
            if (followCounter.InnerText == "0")
            {
                return;
            }

            followCounter.InvokeMember("click");
            new GetPage().GetFollowsPage(browser);
        }

        /// <summary>
        /// Walks every page of the follow list and writes one line per followed
        /// account in the form "No.N|name|url".
        /// </summary>
        /// <param name="browser">The browser control showing the follow list.</param>
        /// <param name="sw">
        /// Destination writer. It is always closed before this method returns,
        /// including when an exception is thrown.
        /// </param>
        /// <remarks>
        /// Writes nothing when the "pl_relation_myfollow" panel is absent, which is
        /// the case when <see cref="GetFollows"/> returned without opening the list.
        /// </remarks>
        public void GetFollowsUrl(WebBrowser browser, StreamWriter sw)
        {
            try
            {
                int urlCount = 0;
                // Whether another page remains to be read.
                bool hasNext = true;
                while (hasNext)
                {
                    // Assume there is no next page until a "next" link is found.
                    hasNext = false;

                    HtmlElement followPanel = browser.Document.GetElementById("pl_relation_myfollow");
                    if (followPanel == null)
                    {
                        // Not on the follow list page, so there is nothing to export.
                        break;
                    }

                    foreach (HtmlElement entry in followPanel.GetElementsByTagName("div"))
                    {
                        if (entry.GetAttribute("action-type") != "ignore_list")
                        {
                            continue;
                        }

                        // The entry's markup is expected to be a link (href) whose
                        // first child carries the account name in its alt attribute.
                        // Entries that do not have this shape are skipped.
                        string markup = entry.InnerHtml;
                        if (string.IsNullOrEmpty(markup))
                        {
                            continue;
                        }
                        HtmlNode anchor = HtmlNode.CreateNode(markup);
                        if (anchor == null
                            || anchor.Attributes["href"] == null
                            || anchor.FirstChild == null
                            || anchor.FirstChild.Attributes["alt"] == null)
                        {
                            continue;
                        }

                        string url = anchor.Attributes["href"].Value;
                        string followname = anchor.FirstChild.Attributes["alt"].Value;

                        sw.WriteLine("No.{0}|{1}|{2}", ++urlCount, followname, url);
                    }

                    // Look for a "next page" link.
                    foreach (HtmlElement page in followPanel.GetElementsByTagName("span"))
                    {
                        if (page.InnerText == "下一頁")
                        {
                            hasNext = true;
                            page.InvokeMember("click");

                            // Blank out part of the old page so that GetFollowsNextPage
                            // only succeeds once the new content has been rendered.
                            browser.Document.GetElementById("pl_relation_myfollow").Children[2].Children[2].OuterHtml = null;

                            // Wait for the next page of the follow list.
                            new GetPage().GetFollowsNextPage(browser);
                            break;
                        }
                    }
                }
            }
            finally
            {
                // Callers rely on the writer being closed here (the file is reopened
                // for reading afterwards), so close it on every path.
                sw.Close();
            }
        }
    }
    //將微博設為一類
    /// <summary>
    /// Crawls all posts of one followed account and writes them, one numbered
    /// line per post, to "D:\weibo\&lt;FollowName&gt;.txt".
    /// </summary>
    public class WeiBo
    {
        private string FollowName;
        private string FollowUrl;

        /// <summary>Creates a crawler for one followed account.</summary>
        /// <param name="FollowName">Account name; used as the output file name.</param>
        /// <param name="FollowUrl">Absolute URL of the account's home page.</param>
        public WeiBo(string FollowName, string FollowUrl)
        {
            this.FollowName = FollowName;
            this.FollowUrl = FollowUrl;
        }

        /// <summary>
        /// Navigates to the account page, detects its layout, and writes every
        /// post of every page to the output file.
        /// </summary>
        /// <param name="browser">The browser control used for navigation and scraping.</param>
        /// <remarks>
        /// The output file is created even when the layout is not recognised; in
        /// that case it stays empty. The file is closed on every exit path.
        /// </remarks>
        public void GetWeiBo(WebBrowser browser)
        {
            // Make sure the output directory exists before creating the file in it.
            Directory.CreateDirectory("D:\\weibo\\");

            using (StreamWriter sw = File.CreateText("D:\\weibo\\" + FollowName + ".txt"))
            {
                browser.Navigate(new Uri(FollowUrl));
                GetPage pageLoader = new GetPage();
                pageLoader.GetFollowMainPage(browser);

                string kind = DetectKind(browser);
                int postCount = 0;
                bool next = true;
                while (next)
                {
                    switch (kind)
                    {
                        case "P":
                            // Personal accounts: posts are div elements.
                            postCount = WriteFeedPosts(browser, sw, "div", postCount);
                            next = OpenNextFeedPage(browser, pageLoader);
                            break;
                        case "M":
                            // Media (small) accounts: posts are p elements.
                            postCount = WriteFeedPosts(browser, sw, "p", postCount);
                            next = OpenNextFeedPage(browser, pageLoader);
                            break;
                        case "J":
                            // Magazine / news accounts use the "feed_list" layout.
                            postCount = WriteJournalPosts(browser, sw, postCount);
                            next = OpenNextJournalPage(browser, pageLoader);
                            break;
                        default:
                            // Layout not recognised yet; the using block closes the file.
                            return;
                    }
                }
            }
        }

        // Classifies the loaded page: "J" when "epfeedlist" exists; otherwise "M" or
        // "P" depending on whether the second child of "pl_content_hisFeed" has
        // children; "N" when neither container exists.
        private static string DetectKind(WebBrowser browser)
        {
            string kind = "N";

            HtmlElement hisFeed = browser.Document.GetElementById("pl_content_hisFeed");
            if (hisFeed != null)
            {
                // Per the original author: on media pages Children[1].Children[0] is
                // a "dl" tag, while on personal pages Children[1] is only an HTML
                // comment and therefore has no children.
                bool secondChildHasContent =
                    hisFeed.Children.Count > 1 && hisFeed.Children[1].Children.Count != 0;
                kind = secondChildHasContent ? "M" : "P";
            }

            if (browser.Document.GetElementById("epfeedlist") != null)
            {
                kind = "J";
            }
            return kind;
        }

        // Writes every tagName element inside "pl_content_hisFeed" whose node-type is
        // "feed_list_content"; returns the updated running post number.
        private static int WriteFeedPosts(WebBrowser browser, StreamWriter sw, string tagName, int written)
        {
            HtmlElementCollection candidates =
                browser.Document.GetElementById("pl_content_hisFeed").GetElementsByTagName(tagName);
            foreach (HtmlElement candidate in candidates)
            {
                if (candidate.GetAttribute("node-type") == "feed_list_content")
                {
                    // The post text is passed as an argument, never as part of the
                    // format string, so braces in a post cannot cause a FormatException.
                    sw.WriteLine("第{0}條|{1}", ++written, candidate.InnerText);
                }
            }
            return written;
        }

        // Clicks the first "next page" span inside "pl_content_hisFeed" and waits for
        // the new page; returns false when there is no such span.
        private static bool OpenNextFeedPage(WebBrowser browser, GetPage pageLoader)
        {
            HtmlElementCollection spans =
                browser.Document.GetElementById("pl_content_hisFeed").GetElementsByTagName("span");
            foreach (HtmlElement span in spans)
            {
                if (span.InnerText == "下一頁")
                {
                    span.InvokeMember("click");
                    pageLoader.GetFollowMainNextPage(browser);
                    return true;
                }
            }
            return false;
        }

        // Writes the first p element of every child of "feed_list"; children without
        // a p element are skipped. Returns the updated running post number.
        private static int WriteJournalPosts(WebBrowser browser, StreamWriter sw, int written)
        {
            int itemCount = browser.Document.GetElementById("feed_list").Children.Count;
            for (int i = 0; i < itemCount; i++)
            {
                HtmlElementCollection paragraphs =
                    browser.Document.GetElementById("feed_list").Children[i].GetElementsByTagName("p");
                if (paragraphs.Count == 0)
                {
                    continue;
                }
                sw.WriteLine("第{0}條|{1}", ++written, paragraphs[0].InnerText);
            }
            return written;
        }

        // Clicks "next page" when it is the last em element of the sibling that
        // follows "feed_list", then waits for the new page; returns false otherwise.
        private static bool OpenNextJournalPage(WebBrowser browser, GetPage pageLoader)
        {
            HtmlElement pager = browser.Document.GetElementById("feed_list").NextSibling;
            if (pager == null)
            {
                return false;
            }

            HtmlElementCollection ems = pager.GetElementsByTagName("em");
            if (ems.Count == 0)
            {
                return false;
            }

            HtmlElement last = ems[ems.Count - 1];
            if (last.InnerText != "下一頁")
            {
                return false;
            }

            last.InvokeMember("click");
            // Remove the old list so the page wait only succeeds once a new
            // "feed_list" element has been rendered.
            browser.Document.GetElementById("feed_list").OuterHtml = null;
            pageLoader.GetFollowMainNextPage(browser);
            return true;
        }
    }
}
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Windows.Forms;
using System.IO;

namespace WeiBoGrab
{
    /// <summary>
    /// Main window of the crawler. A button click logs in with the credentials
    /// typed into the form, exports the followed accounts to "FollowUrl.txt",
    /// and then crawls the posts of each exported account.
    /// </summary>
    public partial class Form1 : Form
    {
        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>
        /// Runs the whole crawl: load login page, log in, load home page, export
        /// the follow list, then crawl every account listed in the export file.
        /// </summary>
        /// <param name="sender">The control that raised the click; disabled while the crawl runs.</param>
        /// <param name="e">Unused event data.</param>
        private void button1_Click(object sender, EventArgs e)
        {
            const string followUrlFile = "FollowUrl.txt";

            string username = textBox1.Text;
            string password = textBox2.Text;
            string url = "http://weibo.com/";
            GetPage getpage = new GetPage();
            WebBrowser browser = webBrowser1;

            // The page waits pump the message loop with Application.DoEvents, so a
            // second click could start a nested crawl; disable the trigger meanwhile.
            Control trigger = sender as Control;
            if (trigger != null)
            {
                trigger.Enabled = false;
            }

            try
            {
                // GetFollowsUrl closes the writer itself; the using block additionally
                // guarantees the file is released if an earlier step throws.
                using (StreamWriter sw = File.CreateText(followUrlFile))
                {
                    browser.Navigate(new Uri(url));
                    // Wait for the login page.
                    textBox3.Text += getpage.GetLoginPage(browser);
                    // Log in.
                    LoginSubmit loginsubmit = new LoginSubmit(username, password);
                    loginsubmit.LoginClick(browser);
                    // Wait for the user's home page.
                    textBox3.Text += getpage.GetMainPage(browser);
                    // Open the follow list and export it.
                    Follow follow = new Follow();
                    follow.GetFollows(browser);
                    follow.GetFollowsUrl(browser, sw);
                }

                using (StreamReader sr = new StreamReader(followUrlFile))
                {
                    string line;
                    while ((line = sr.ReadLine()) != null)
                    {
                        // Each line is "No.N|name|url"; skip anything else.
                        string[] fields = line.Split('|');
                        if (fields.Length < 3)
                        {
                            continue;
                        }

                        WeiBo feed = new WeiBo(fields[1], fields[2]);
                        feed.GetWeiBo(browser);
                    }
                }
            }
            finally
            {
                if (trigger != null)
                {
                    trigger.Enabled = true;
                }
            }
        }
    }
}

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM