C#去掉HTML標記


該方法親測可行,下面直接粘貼代碼。

/// <summary>
/// Strips HTML markup from <paramref name="htmlStream"/> and returns the remaining text
/// with whitespace normalized.
/// </summary>
/// <remarks>
/// Processing order:
/// 1. Whole <c>&lt;script&gt;</c> and <c>&lt;style&gt;</c> elements (including their content) are replaced by a space.
/// 2. Every remaining tag and a fixed set of entities
///    (<c>&amp;nbsp; &amp;amp; &amp;shy; &amp;#160; &amp;#173; &amp;bull; &amp;lt; &amp;gt;</c>) are replaced by a space.
///    Entities are blanked out, not decoded.
/// 3. Runs of blank lines are collapsed into a single <c>\r\n</c>.
/// 4. Runs of tabs/spaces are collapsed into a single space, and the result is trimmed.
/// </remarks>
/// <param name="htmlStream">The HTML text to clean. Must not be <c>null</c>.</param>
/// <returns>The text content with tags removed and whitespace collapsed.</returns>
/// <exception cref="ArgumentNullException"><paramref name="htmlStream"/> is <c>null</c>.</exception>
public string RemoveHTMLTags(string htmlStream)
{
    if (htmlStream == null)
    {
        // ArgumentNullException derives from Exception, so callers that caught the
        // previously thrown bare Exception keep working.
        throw new ArgumentNullException(nameof(htmlStream), "Your input html stream is null!");
    }

    // Script and style elements must be removed first and separately: their bodies may
    // contain '<' / '>' characters that the generic tag pattern below would mis-handle.
    // The lazy quantifier stops at the first closing tag.
    htmlStream = Regex.Replace(
        htmlStream,
        @"<script[\s\S]*?</script>|<style[\s\S]*?</style>",
        " ",
        RegexOptions.IgnoreCase);

    // Remove ordinary tags and blank out the known entities.
    htmlStream = Regex.Replace(
        htmlStream,
        @"<[^>]+>|&nbsp;|&amp;|&shy;|&#160;|&#173;|&bull;|&lt;|&gt;",
        " ",
        RegexOptions.IgnoreCase);

    // Collapse blank lines (lines containing only whitespace) into one line break.
    // Note: inside [...] a '|' is a literal character, not alternation, so it must not
    // appear in these classes or pipe characters in the text would be eaten.
    htmlStream = Regex.Replace(htmlStream, @"\r\n[\r\n\t ]*\r\n|\n[\r\n\t ]*\n", "\r\n");

    // Collapse runs of horizontal whitespace into a single space.
    htmlStream = Regex.Replace(htmlStream, @"[\t ]+", " ");

    return htmlStream.Trim();
}

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯繫本站郵箱 yoyou2525@163.com 刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM