C#獲取網頁信息核心方法(入門一)


目錄:信息采集入門系列目錄

 下面記錄的是我自己整理的C#請求頁面核心類,主要有如下幾個方法

1.HttpWebRequest Get請求獲得頁面html

2.HttpWebRequest Post請求獲得頁面html

3.模擬登錄獲得cookie內容

4.模擬登錄獲得cookie字符串

5.代理的設置

6.利用webbrowser 獲取js生成的頁面

7.為webbrowser設置cookie,模擬登錄

8.使用demo

HttpWebRequest Get請求獲得頁面html

注意點:以前抓取覺得很慢,最后發現是代理的問題,沒有代理就設置為null,這樣就不用每次去找代理,影響執行效率,還有一些參數可以自行設置,比如模擬瀏覽器等。

        /// <summary>
        /// Performs a GET request and returns the page HTML.
        /// </summary>
        /// <param name="url">URL to fetch.</param>
        /// <param name="proxy">Proxy to use; pass null for a direct connection (avoids a slow proxy lookup on every request).</param>
        /// <param name="cookie">Cookies required by the site (may be null).</param>
        /// <param name="timeout">Request timeout in milliseconds (default 10s).</param>
        /// <returns>The response body decoded as UTF-8.</returns>
        public static string Crawl(string url, WebProxy proxy, CookieContainer cookie, int timeout = 10000)
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
            request.Proxy = proxy;
            request.Timeout = timeout;
            request.AllowAutoRedirect = true;
            request.CookieContainer = cookie;
            try
            {
                // using-statements guarantee disposal; exceptions propagate with
                // their original stack trace (the old `throw ex;` destroyed it).
                using (WebResponse response = request.GetResponse())
                using (StreamReader reader = new StreamReader(response.GetResponseStream(), Encoding.UTF8))
                {
                    return reader.ReadToEnd();
                }
            }
            finally
            {
                // Release the connection even when the request fails.
                request.Abort();
            }
        }

 

HttpWebRequest Post請求獲得頁面html

        /// <summary>
        /// Performs a POST request and returns the page HTML.
        /// </summary>
        /// <param name="url">URL to post to.</param>
        /// <param name="postdata">URL-encoded body, e.g. "id=1&amp;name=test".</param>
        /// <param name="proxy">Proxy to use; pass null for a direct connection.</param>
        /// <param name="cookie">Cookies required by the site (may be null).</param>
        /// <param name="timeout">Request timeout in milliseconds (default 10s).</param>
        /// <returns>The response body decoded as UTF-8.</returns>
        public static string Crawl(string url, string postdata, WebProxy proxy, CookieContainer cookie, int timeout = 10000)
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
            request.Proxy = proxy;
            request.Timeout = timeout;
            request.AllowAutoRedirect = true;
            request.CookieContainer = cookie;
            request.Method = "POST";
            request.ContentType = "application/x-www-form-urlencoded";

            // UTF-8, not ASCII: ASCII silently replaces any non-ASCII character
            // in the body with '?', corrupting the posted data.
            byte[] body = Encoding.UTF8.GetBytes(postdata);
            request.ContentLength = body.Length;
            try
            {
                using (Stream requestStream = request.GetRequestStream())
                {
                    requestStream.Write(body, 0, body.Length);
                }
                using (WebResponse response = request.GetResponse())
                using (StreamReader reader = new StreamReader(response.GetResponseStream(), Encoding.UTF8))
                {
                    return reader.ReadToEnd();
                }
            }
            finally
            {
                // Release the connection even when the request fails.
                request.Abort();
            }
        }

 

模擬登錄獲得cookie內容

先找到登錄的頁面,分析登錄頁面的post參數和鏈接,獲得cookie后可以直接傳到上面的方法

        /// <summary>
        /// Requests a (login) page and returns the cookies the server issues.
        /// The returned container can be passed straight to the Crawl overloads.
        /// </summary>
        /// <param name="url">URL of the page to request (typically a login page).</param>
        /// <param name="proxy">Proxy to use; pass null for a direct connection.</param>
        /// <param name="timeout">Request timeout in milliseconds (default 10s).</param>
        /// <returns>A CookieContainer holding the cookies set by the server.</returns>
        public static CookieContainer GetCookie(string url, WebProxy proxy, int timeout = 10000)
        {
            CookieContainer cc = new CookieContainer();
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
            request.Proxy = proxy;
            request.Timeout = timeout;
            request.AllowAutoRedirect = true;
            request.CookieContainer = cc;
            try
            {
                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                {
                    // Surface the collected cookies on the response object as well,
                    // mirroring the container's contents for this URI.
                    response.Cookies = cc.GetCookies(request.RequestUri);
                }
                return cc;
            }
            finally
            {
                // Release the connection even when the request fails.
                request.Abort();
            }
        }

 

模擬登錄獲得cookie字符串

        /// <summary>
        /// Requests a (login) page and returns the resulting cookies as a header
        /// string ("name1=value1; name2=value2"), usable with SetCookie/WebBrowser.
        /// </summary>
        /// <param name="url">URL of the page to request (typically a login page).</param>
        /// <param name="proxy">Proxy to use; pass null for a direct connection.</param>
        /// <param name="timeout">Request timeout in milliseconds (default 10s).</param>
        /// <returns>The cookie header string for the requested URI.</returns>
        public static string GetCookieString(string url, WebProxy proxy, int timeout = 10000)
        {
            CookieContainer cc = new CookieContainer();
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
            request.Proxy = proxy;
            request.Timeout = timeout;
            request.AllowAutoRedirect = true;
            request.CookieContainer = cc;
            try
            {
                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                {
                    response.Cookies = cc.GetCookies(request.RequestUri);
                }
                return cc.GetCookieHeader(request.RequestUri);
            }
            finally
            {
                // Release the connection even when the request fails.
                request.Abort();
            }
        }

 

代理的設置

       /// <summary>
        /// Builds a WebProxy from an address and a set of credentials.
        /// </summary>
        /// <param name="port">Proxy address URI, e.g. "http://host:port" — despite the
        /// name, this is the full proxy address, not just a port number.</param>
        /// <param name="user">Proxy account user name.</param>
        /// <param name="password">Proxy account password.</param>
        /// <returns>The configured proxy.</returns>
        public static WebProxy CreatePorxy(string port, string user, string password)
        {
            return new WebProxy
            {
                Address = new Uri(port),
                Credentials = new NetworkCredential(user, password)
            };
        }

 

利用webbrowser 獲取js生成的頁面

說明:由於不知道頁面什么時候執行完成,這里是等待5s,默認執行完成,效率有待提高。

另外執行需要線程安全添加[STAThread]

        /// <summary>
        /// Fetches a page with a WebBrowser control so that JavaScript-generated
        /// content is included. After the document loads, it waits a further fixed
        /// 5 seconds for scripts to run. The caller must be on an STA thread
        /// (mark the entry point with [STAThread]).
        /// </summary>
        /// <param name="url">URL to load.</param>
        /// <returns>The InnerHtml of the document's active element after the wait.</returns>
        public static string CrawlDynamic(string url)
        {
            // Dispose the control and the timer: the original leaked both.
            using (WebBrowser browser = new WebBrowser())
            {
                browser.ScriptErrorsSuppressed = true;
                browser.Navigate(url);

                // Pump messages until the document itself has loaded.
                while (browser.ReadyState != WebBrowserReadyState.Complete)
                {
                    Application.DoEvents();
                }

                // We cannot know when the page's scripts finish, so wait a fixed
                // 5 seconds while continuing to pump messages.
                var isComplete = false;
                using (var timer = new System.Timers.Timer(1000 * 5))
                {
                    // Fire once; no need to Stop() inside the handler.
                    timer.AutoReset = false;
                    timer.Elapsed += (sender, e) => isComplete = true;
                    timer.Start();

                    while (!isComplete)
                        Application.DoEvents();
                }

                var htmldocument = browser.Document;
                return htmldocument.ActiveElement.InnerHtml;
            }
        }

 

為webbrowser設置cookie,模擬登錄

 剛開始始終不成功以為這個方法不能用,后面發現原來是domain設置有問題,我的例子是www.aa.xxx.com,設置的為http://xx.com可以使用,這個地方可能需要根據自己的情況來選擇域名。

        // WinINet API: stores a cookie for the given URL in the shared WinINet
        // cookie store, which the WebBrowser control reads from.
        [DllImport("wininet.dll", CharSet = CharSet.Auto, SetLastError = true)]
        public static extern bool InternetSetCookie(string lpszUrlName, string lbszCookieName, string lpszCookieData);

        /// <summary>
        /// Applies a cookie header string to the WinINet store so the WebBrowser
        /// control sends the cookies, simulating a logged-in session.
        /// Note: the domain must match the site's cookie domain — e.g. for
        /// www.aa.xxx.com the working value may be "http://xxx.com".
        /// </summary>
        /// <param name="cookieStr">Cookie string ("name1=value1; name2=value2"), e.g. from GetCookieString.</param>
        /// <param name="domain">URL/domain to register the cookies under.</param>
        public static void SetCookie(string cookieStr, string domain)
        {
            foreach (string c in cookieStr.Split(';'))
            {
                // Split on the FIRST '=' only: cookie values (e.g. base64 tokens)
                // often contain '=' themselves, and the old Split('=') + Length==2
                // guard silently dropped such cookies.
                string[] item = c.Split(new[] { '=' }, 2);
                if (item.Length == 2)
                {
                    // Trim the space left after splitting "a=1; b=2" on ';'.
                    string name = item[0].Trim();
                    string value = item[1];
                    InternetSetCookie(domain, name, value);
                }
            }
        }

使用demo

            // Proxy — pass null if you don't use one.
            WebProxy proxy = WebCrawl.WebRequestHelper.CreatePorxy("xx.com", "user", "password");

            // Get cookies from the login page.
            CookieContainer cookie = WebCrawl.WebRequestHelper.GetCookie("http://xxxx.login.com", proxy);

            // Fetch a page.
            string html = WebCrawl.WebRequestHelper.Crawl("http://xxx.index.com", proxy, cookie);

            // Get the cookie header string from the login page.
            string cookiestr = WebCrawl.WebRequestHelper.GetCookieString("http://xxxx.login.com", proxy);

            // Register the cookies for the WebBrowser control.
            WebCrawl.WebRequestHelper.SetCookie(cookiestr, "https://xx.com");

            // Fetch a page that requires login and is generated by JS (plain pages work too).
            string htmlWithJs = WebCrawl.WebRequestHelper.CrawlDynamic("http://xxx.index.com");

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM