獲取HTML源碼（只取文字，判斷編碼，過濾標簽）

本文轉載自查看原文 2012-12-04 16:10 2995 C#/ 常用代碼/ Web - HTML\CSS

/// <summary>
/// Click handler: reads a URL from <c>textBox1</c>, and if it looks like a
/// URL, downloads the page with <c>GetHtml</c> and strips its markup with
/// <c>StripHTML</c>.
/// </summary>
/// <param name="sender">The control that raised the event (unused).</param>
/// <param name="e">Event data (unused).</param>
private void button1_Click(object sender, EventArgs e)
{
    string url = this.textBox1.Text;

    // Loose shape check only: scheme letters, "://", then any non-whitespace.
    // The character class used to be [a-zA-z]; the A-z range also accepts
    // the punctuation between 'Z' and 'a' ([ \ ] ^ _ `), so it is now a-zA-Z.
    if (!Regex.IsMatch(url, @"[a-zA-Z]+://[^\s]*"))
    {
        return;
    }

    try
    {
        string html = GetHtml(url);
        if (html == null)
        {
            // GetHtml returns null after showing its own error message when
            // the request fails; there is nothing left to strip.
            return;
        }

        // NOTE(review): the stripped text is computed but not used anywhere
        // in this handler — presumably it should be displayed or stored.
        string text = StripHTML(html);
    }
    catch (Exception ex)
    {
        // Previously swallowed silently; surface the reason to the user
        // (e.g. an unsupported scheme or malformed URL rejected by GetHtml).
        MessageBox.Show(ex.Message);
    }
}

1.獲取HTML

GetHtml(String Url)

View Code

        /// <summary>
        /// Downloads the resource at <paramref name="Url"/> and returns it as text.
        /// The body is decoded as UTF-8 first; if that text contains the Unicode
        /// replacement character (as reported by <c>isLuan</c>), the same bytes
        /// are decoded as GB2312 instead.
        /// </summary>
        /// <param name="Url">Absolute URL to fetch.</param>
        /// <returns>
        /// The decoded text, or <c>null</c> when the request fails (the error is
        /// shown in a message box in that case).
        /// </returns>
        public String GetHtml(String Url)
        {
            // Created outside the try block, as before, so a malformed or
            // unsupported URL still propagates to the caller as an exception.
            WebRequest request = WebRequest.Create(Url);
            request.Timeout = 50000;

            byte[] body;
            try
            {
                // Fetch once and keep the raw bytes. The previous version issued
                // two identical requests just to decode the body twice, never
                // closed the second response, and dereferenced it even when only
                // the first request had succeeded.
                using (WebResponse response = request.GetResponse())
                using (Stream stream = response.GetResponseStream())
                using (MemoryStream buffer = new MemoryStream())
                {
                    stream.CopyTo(buffer);
                    body = buffer.ToArray();
                }
            }
            catch (WebException e)
            {
                MessageBox.Show(e.Message.ToString());
                return null;
            }
            catch (Exception e)
            {
                MessageBox.Show(e.ToString());
                return null;
            }

            string utf8Text = DecodeHtml(body, "UTF-8");
            if (!isLuan(utf8Text))
            {
                return utf8Text;
            }

            // UTF-8 decoding produced replacement characters; fall back to GB2312.
            return DecodeHtml(body, "GB2312");
        }

        /// <summary>
        /// Decodes <paramref name="content"/> with the named encoding. A
        /// StreamReader is used (rather than Encoding.GetString) so a leading
        /// byte-order mark is handled the same way as when reading the
        /// response stream directly.
        /// </summary>
        private static string DecodeHtml(byte[] content, string encodingName)
        {
            using (StreamReader reader = new StreamReader(
                new MemoryStream(content), Encoding.GetEncoding(encodingName)))
            {
                return reader.ReadToEnd();
            }
        }

2.去除HTML標記（正則表達式）

StripHTML(string strHtml)

View Code

 /// <summary>
 /// Removes HTML markup from a string and returns the remaining text.
 /// Script and style elements are dropped together with their content,
 /// line breaks and tabs are deleted, straight apostrophes become ’, and
 /// runs of spaces are collapsed to a single space.
 /// </summary>
 /// <param name="strHtml">Source text containing HTML.</param>
 /// <returns>
 /// The text with markup removed; an empty string when
 /// <paramref name="strHtml"/> is null or empty.
 /// </returns>
 public static string StripHTML(string strHtml)
 {
     if (string.IsNullOrEmpty(strHtml))
     {
         return string.Empty;
     }

     // Drop <script>…</script> and <style>…</style> including their bodies.
     // (?is): case-insensitive, and '.' also matches newlines.
     strHtml = Regex.Replace(strHtml, "(?is)<script[^>]*>.*?</script>", "");
     strHtml = Regex.Replace(strHtml, "(?is)<style[^>]*>.*?</style>", "");

     // Non-breaking-space entities become ordinary spaces.
     strHtml = Regex.Replace(strHtml, "(?i)&nbsp;", " ");

     // Line breaks are deleted outright (not replaced by a space).
     // '+' instead of '*' avoids a no-op empty match at every position.
     strHtml = Regex.Replace(strHtml, @"[\r\n]+", "");

     // Remove well-formed tags.
     strHtml = Regex.Replace(strHtml, "<[^<>]*>", "");

     // Delete tabs.
     strHtml = Regex.Replace(strHtml, "\t+", "");

     // Replace the straight apostrophe with a typographic one.
     strHtml = Regex.Replace(strHtml, "'", "’");

     // Collapse runs of spaces into ONE space. The replacement used to be
     // two spaces, which doubled every single space in the output.
     strHtml = Regex.Replace(strHtml, " +", " ");

     // Second pass for anything still bracketed, then drop stray angle
     // brackets and any "&nbsp;" that only formed once tags were removed.
     string strOutput = Regex.Replace(strHtml, "<.+?>", "");
     strOutput = strOutput.Replace("<", "");
     strOutput = strOutput.Replace(">", "");
     strOutput = strOutput.Replace("&nbsp;", "");

     return strOutput;
 }

3.判斷是否為亂碼（編碼）：在GetHtml里調用。

View Code

        /// <summary>
        /// Reports whether <paramref name="txt"/> looks garbled, i.e. whether its
        /// UTF-8 encoding contains the byte sequence EF BF BD (239 191 189),
        /// which is the encoding of the Unicode replacement character U+FFFD.
        /// </summary>
        /// <param name="txt">Decoded text to inspect.</param>
        /// <returns>
        /// <c>true</c> if the sequence is present; <c>false</c> otherwise,
        /// including for null or empty input.
        /// </returns>
        bool isLuan(string txt)
        {
            if (string.IsNullOrEmpty(txt))
            {
                return false;
            }

            byte[] bytes = Encoding.UTF8.GetBytes(txt);

            // Slide a 3-byte window over the buffer. The bound is i + 2 so the
            // final window is checked too; the earlier guard
            // (i < bytes.Length - 3) skipped it and missed a replacement
            // character at the very end of the text.
            for (int i = 0; i + 2 < bytes.Length; i++)
            {
                if (bytes[i] == 239 && bytes[i + 1] == 191 && bytes[i + 2] == 189)
                {
                    return true;
                }
            }

            return false;
        }

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯系本站郵箱yoyou2525@163.com刪除。

猜您在找 富文本怎么只獲取文字內容去除html標簽 UEditor 過濾HTML標簽 PHP過濾各種html標簽 js過濾HTML標簽以及正則過濾html的標簽 HTML a標簽文字顏色 HTML中的文字標簽 Python通過正則表達式去除(過濾)HTML標簽，提取文字 Java對html標簽的過濾和清洗 PHP實現過濾各種HTML標簽