該方法親測可行，下面直接粘貼代碼。
/// <summary>
/// Strips HTML markup from <paramref name="htmlStream"/> and returns the remaining
/// text with whitespace normalized.
/// </summary>
/// <remarks>
/// Processing order:
/// 1. Whole <c>&lt;script&gt;</c> and <c>&lt;style&gt;</c> blocks are replaced by a space.
/// 2. Every remaining tag and a fixed set of entities (non-breaking space, ampersand,
///    soft hyphen, bullet, angle brackets) are replaced by a space.
/// 3. Runs of blank lines are collapsed into a single CRLF.
/// 4. Runs of tabs/spaces are collapsed into a single space, then the result is trimmed.
/// Entities are replaced by a space, not decoded.
/// </remarks>
/// <param name="htmlStream">The HTML text to clean. Must not be null.</param>
/// <returns>The text content with tags removed and whitespace collapsed.</returns>
/// <exception cref="ArgumentNullException"><paramref name="htmlStream"/> is null.</exception>
public string RemoveHTMLTags(string htmlStream)
{
    if (htmlStream == null)
    {
        // ArgumentNullException derives from Exception, so callers that catch
        // Exception keep working; the message text is unchanged.
        throw new ArgumentNullException("htmlStream", "Your input html stream is null!");
    }

    // Script and style elements must be handled separately and first: their bodies
    // may contain text that looks like markup. The lazy quantifier stops at the
    // nearest closing tag so that text between two separate blocks is preserved.
    htmlStream = Regex.Replace(
        htmlStream,
        @"(<script)[\s\S]*?(</script>)|(<style)[\s\S]*?(</style>)",
        " ",
        RegexOptions.IgnoreCase);

    // Ordinary tags, plus entities that should read as a separator. Both the entity
    // spelling and the already-decoded characters (U+00A0, U+00AD, U+2022) are covered.
    // Bare '&', '<' and '>' are NOT matched: '&' is legitimate text, and a stray
    // angle bracket that is not part of a tag should be left alone.
    htmlStream = Regex.Replace(
        htmlStream,
        @"<[^>]+>|&nbsp;|&amp;|&shy;|&#160;|&#173;|&bull;|&lt;|&gt;|\u00A0|\u00AD|\u2022",
        " ",
        RegexOptions.IgnoreCase);

    // Collapse consecutive blank lines (optionally containing tabs/spaces) into one
    // line break. Note: inside a character class '|' is a literal, so it must not be
    // used as a separator there.
    htmlStream = Regex.Replace(
        htmlStream,
        @"(\r\n[\r\n\t ]*\r\n)|(\n[\r\n\t ]*\n)",
        "\r\n");

    // Collapse runs of tabs and spaces into a single space.
    htmlStream = Regex.Replace(htmlStream, @"[\t ]+", " ");

    return htmlStream.Trim();
}