網站采集(根據正則表達式截取需要的html數據)


所有網站都可以通過url地址獲取該網站編譯之后的html源代碼,方法如下:

         需要用到的命名空間:

             using System;

using System.Collections.Generic;

using System.Text;

using System.Diagnostics;

using System.Text.RegularExpressions;

using System.IO;

using System.Net;

        

        /// <summary>  

        /// 取得網頁源碼  

        /// </summary>  

        /// <param name="url">網頁地址,eg:"http://www.xxx.com/" </param>   

        /// <param name="charset">網頁編碼,eg:"utf-8"</param>  

        /// <returns>返回網頁源文件</returns>  

        public static string GetHtmlSource(string url, string charset)

        {

            //編碼處理   

            Encoding nowCharset;

            if (charset == "" || charset == null)

            {

                nowCharset = Encoding.Default;

            }

            else

            {

                nowCharset = Encoding.GetEncoding(charset);

            }

 

            //處理內容  

            string html = "";

            try

            {

                //WebRequest myWebRequest = WebRequest.Create(url);  

                //WebResponse myWebResponse = myWebRequest.GetResponse();  

                //Stream stream = myWebResponse.GetResponseStream();  

                //StreamReader reader = new StreamReader(stream, nowCharset);  

 

                HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);

                HttpWebResponse response = (HttpWebResponse)request.GetResponse();

                Stream stream = response.GetResponseStream();

                StreamReader reader = new StreamReader(stream, nowCharset);

                html = reader.ReadToEnd();

                stream.Close();

            }

            catch (Exception e)

            {

            }

            return html;

        }

 

        /// <summary>  

        /// 取得網頁源碼  

        /// </summary>  

        /// <param name="url">網頁地址,eg: "http://www.xxx.com/" </param>   

        /// <param name="charset">網頁編碼,eg: Encoding.UTF8</param>  

        /// <returns>返回網頁源文件</returns>  

        public static string GetHtmlSource(string url, Encoding charset)

        {

            //處理內容  

            string html = "";

            try

            {

                HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);

                HttpWebResponse response = (HttpWebResponse)request.GetResponse();

                Stream stream = response.GetResponseStream();

                StreamReader reader = new StreamReader(stream, charset);

                html = reader.ReadToEnd();

                stream.Close();

            }

            catch (Exception e)

            {

            }

            return html;

        }

 

        /// <summary>  

        /// 取得網頁源碼  

        /// 對於帶BOM的網頁很有效,不管是什么編碼都能正確識別  

        /// </summary>  

        /// <param name="url">網頁地址,eg: "http://www.xxx.com/" </param>   

        /// <returns>返回網頁源文件</returns>  

        public static string GetHtmlSource(string url)

        {

            //處理內容  

            string html = "";

            try

            {

                HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);

                HttpWebResponse response = (HttpWebResponse)request.GetResponse();

                Stream stream = response.GetResponseStream();

                StreamReader reader = new StreamReader(stream, Encoding.Default);

                html = reader.ReadToEnd();

                stream.Close();

            }

            catch (Exception e)

            {

            }

            return html;

        }  

 

根據不同的情況調用不同的方式去獲取,例如:

            string _html = Collection.GetHtmlSource("http://www.luohx.com/a.html", "utf-8");

也可以在url參數里面附帶參數,例如

   string _html = Collection.GetHtmlSource("http://www.luohx.com/a.aspx?a=1&b=2", "utf-8");

 

  當采集到網站源碼后,會發現我們往往並不需要全部代碼,而只需要其中的一部分,比如:標簽<div id="xml" class="wrap"></div>中間的html。那么,我們需要對源代碼進行截取,方法如下:

      #region 獲取畫冊頁面代碼

        public string strHtml(string url, string charset)

        {

            string _html = Collection.GetHtmlSource(url, charset);//根據url獲取網站html

            string sss = "";

            //正則表達式

            string pattern = @"(?six)<div\s+id=""xml""\s+class=""wrap"">

                                (?'MyCont'

                                  (?>

                                    (?!<div\b|</div>).

                                    |

                                    <div(?:\s+(?:""[^""]*""|'[^']*'|[^""'>])*)?>(?'div')

                                    |

                                    </div>(?'-div')

                                  )*

                                  (?(div)(?!))

                                )

                                </div>";

            foreach (Match m in Regex.Matches(_html, pattern))

            {

                sss = m.Groups["MyCont"].Value;

            }

 

            return sss;

        }

        #endregion

  這里的變量pattern就是針對標簽<div id="xml" class="wrap"></div>的正則表達式。但是必須保證選取的參考對象在頁面中的格式是唯一的:如果同時存在2個或者2個以上的<div id="xml" class="wrap"></div>,就不能用這個標簽作為參考來判定。

   當截取到需要的html代碼模塊的時候,我們發現得到的仍然是一段html代碼。如果我們需要的是不包含html元素的純文本內容,就要將其中的html標簽去掉,例如:

           public static string checkStr(string html)

        {

            System.Text.RegularExpressions.Regex regex1 = new System.Text.RegularExpressions.Regex(@"<script[\s\S]+</script *>", System.Text.RegularExpressions.RegexOptions.IgnoreCase);

            System.Text.RegularExpressions.Regex regex2 = new System.Text.RegularExpressions.Regex(@" href *= *[\s\S]*script *:", System.Text.RegularExpressions.RegexOptions.IgnoreCase);

            System.Text.RegularExpressions.Regex regex3 = new System.Text.RegularExpressions.Regex(@" no[\s\S]*=", System.Text.RegularExpressions.RegexOptions.IgnoreCase);

            System.Text.RegularExpressions.Regex regex4 = new System.Text.RegularExpressions.Regex(@"<iframe[\s\S]+</iframe *>", System.Text.RegularExpressions.RegexOptions.IgnoreCase);

            System.Text.RegularExpressions.Regex regex5 = new System.Text.RegularExpressions.Regex(@"<frameset[\s\S]+</frameset *>", System.Text.RegularExpressions.RegexOptions.IgnoreCase);

            System.Text.RegularExpressions.Regex regex6 = new System.Text.RegularExpressions.Regex(@"\<img[^\>]+\>", System.Text.RegularExpressions.RegexOptions.IgnoreCase); System.Text.RegularExpressions.Regex regex7 = new System.Text.RegularExpressions.Regex(@"</p>", System.Text.RegularExpressions.RegexOptions.IgnoreCase);

            System.Text.RegularExpressions.Regex regex8 = new System.Text.RegularExpressions.Regex(@"<p>", System.Text.RegularExpressions.RegexOptions.IgnoreCase);

            System.Text.RegularExpressions.Regex regex9 = new System.Text.RegularExpressions.Regex(@"<[^>]*>", System.Text.RegularExpressions.RegexOptions.IgnoreCase);

            html = regex1.Replace(html, "");

            html = regex2.Replace(html, "");

            html = regex3.Replace(html, " _disibledevent=");

            html = regex4.Replace(html, "");

            html = regex5.Replace(html, "");

            html = regex6.Replace(html, "");

            html = regex7.Replace(html, "");

            html = regex8.Replace(html, "");

            html = regex9.Replace(html, "");

            html = html.Replace(" ", " ");

            html = html.Replace("</strong>", "");

            html = html.Replace("<strong>", "");

            return html;

        }

     調用方法很簡單,直接string strhtml= checkStr(html)就可以了,當得到所需要的數據時,就可以入庫、顯示等其他的操作了~

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM