/// <summary>
/// Removes HTML markup from a string and returns the remaining text, HTML-encoded and trimmed.
/// </summary>
/// <remarks>
/// Processing order: script blocks, comments and tags are removed; line breaks together with the
/// whitespace that follows them are removed; stray angle brackets are removed; a small set of
/// character entities is decoded; finally the text is HTML-encoded again.
/// </remarks>
/// <param name="Htmlstring">Source text that may contain HTML markup. May be null.</param>
/// <returns>The text content, HTML-encoded and trimmed; an empty string for null or empty input.</returns>
public static string NoHTML(string Htmlstring)
{
    if (string.IsNullOrEmpty(Htmlstring))
    {
        return string.Empty;
    }

    // Remove scripts. Singleline lets '.' cross line breaks, so multi-line script bodies go too.
    Htmlstring = Regex.Replace(Htmlstring, @"<script[^>]*?>.*?</script>", "",
        RegexOptions.IgnoreCase | RegexOptions.Singleline);

    // Remove complete comments before the generic tag rule, otherwise a '>' inside a comment
    // would end the "tag" early and leak the rest of the comment into the output.
    Htmlstring = Regex.Replace(Htmlstring, @"<!--.*?-->", "", RegexOptions.Singleline);

    // Remove HTML tags.
    Htmlstring = Regex.Replace(Htmlstring, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase);

    // Remove a line break together with the whitespace that follows it.
    Htmlstring = Regex.Replace(Htmlstring, @"([\r\n])[\s]+", "", RegexOptions.IgnoreCase);

    // Remove what is left of unbalanced comments. The patterns use ASCII hyphens; the previous
    // en-dash characters could never match a real comment delimiter.
    Htmlstring = Regex.Replace(Htmlstring, @"-->", "", RegexOptions.IgnoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"<!--.*", "", RegexOptions.IgnoreCase);

    // string.Replace returns a new string, so the result has to be assigned. This runs before
    // entity decoding so that only leftover markup is dropped, not characters that the source
    // had escaped on purpose (for example "&lt;").
    Htmlstring = Htmlstring.Replace("<", "").Replace(">", "").Replace("\r\n", "");

    // Decode entities in a single pass so already-decoded text is never decoded a second time
    // (for example "&amp;lt;" becomes "&lt;", not "<").
    Htmlstring = Regex.Replace(Htmlstring, @"&(quot|amp|lt|gt|nbsp|iexcl|cent|pound|copy|#\d+);",
        new MatchEvaluator(DecodeHtmlEntity), RegexOptions.IgnoreCase);

    // WebUtility.HtmlEncode needs no active HTTP request, unlike HttpContext.Current.Server,
    // which is null (and throws) when called outside of one.
    // NOTE(review): believed to match the default Server.HtmlEncode output on .NET 4.5+;
    // confirm that no custom HttpEncoder is configured before relying on that.
    return System.Net.WebUtility.HtmlEncode(Htmlstring).Trim();
}

/// <summary>
/// Maps one matched entity to its replacement text; numeric entities without a mapping are removed.
/// </summary>
private static string DecodeHtmlEntity(Match entity)
{
    switch (entity.Groups[1].Value.ToLowerInvariant())
    {
        case "quot":
        case "#34":
            return "\"";
        case "amp":
        case "#38":
            return "&";
        case "lt":
        case "#60":
            return "<";
        case "gt":
        case "#62":
            return ">";
        case "nbsp":
        case "#160":
            return " ";
        case "iexcl":
        case "#161":
            return "\u00A1";
        case "cent":
        case "#162":
            return "\u00A2";
        case "pound":
        case "#163":
            return "\u00A3";
        case "copy":
        case "#169":
            return "\u00A9";
        default:
            return string.Empty;
    }
}
// A static method that removes HTML tags.
#region
/// <summary>
/// Deletes every substring that starts with '&lt;' and runs up to the next '&gt;'.
/// </summary>
/// <param name="HTMLStr">Text to remove the tags from.</param>
/// <returns>The input with all such substrings removed.</returns>
public static string ParseTags(string HTMLStr)
{
    System.Text.RegularExpressions.Regex tagMatcher =
        new System.Text.RegularExpressions.Regex("<[^>]*>");
    string withoutTags = tagMatcher.Replace(HTMLStr, "");
    return withoutTags;
}
#endregion
#region
/// <summary>
/// Extracts the address of the first image referenced in an HTML fragment.
/// </summary>
/// <param name="HTMLStr">HTML text to search. May be null.</param>
/// <returns>
/// The value of the src attribute of the first img tag that has one, with its original casing
/// and without the surrounding quotes; an empty string when there is none.
/// </returns>
public static string GetImgUrl(string HTMLStr)
{
    if (string.IsNullOrEmpty(HTMLStr))
    {
        return string.Empty;
    }

    // IgnoreCase replaces the former ToLower() call, which also lower-cased the returned URL.
    // The three alternatives cover double-quoted, single-quoted and unquoted attribute values,
    // so the quotes never end up inside the captured address. The whitespace required in front
    // of "src" keeps attributes such as "data-src" from being picked up.
    Match m = Regex.Match(
        HTMLStr,
        @"<img\s+(?:[^>]*?\s)?src\s*=\s*(?:""(?<url>[^""]*)""|'(?<url>[^']*)'|(?<url>[^\s""'>]+))",
        RegexOptions.IgnoreCase);

    return m.Success ? m.Groups["url"].Value : string.Empty;
}
#endregion
using System;
using System.Text.RegularExpressions;

/// <summary>
/// Console demo for <see cref="StripHTML"/>, which extracts the text from HTML source.
/// </summary>
public class StripHTMLTest
{
    // Markup-removal rules, applied in this order; every match is replaced by an empty string.
    // The Regex objects are built once instead of on every call.
    private static readonly Regex[] MarkupRules =
    {
        // Script blocks; Singleline lets '.' cross line breaks so multi-line bodies are removed.
        new Regex(@"<script[^>]*?>.*?</script>", RegexOptions.IgnoreCase | RegexOptions.Singleline),

        // Complete comments, including ones that contain '>'.
        new Regex(@"<!--.*?-->", RegexOptions.Singleline),

        // Tags: '<', optional '/', optional '!', a word character, then anything up to the
        // closing '>' while skipping over quoted attribute values. Every alternative starts
        // with a different character, which avoids the exponential backtracking of the
        // previous nested-quantifier pattern.
        new Regex(@"<(?:/\s*)?!?\w(?:""[^""]*""|'[^']*'|[^'"">])*>", RegexOptions.IgnoreCase),

        // A line break together with the whitespace that follows it.
        new Regex(@"([\r\n])[\s]+", RegexOptions.IgnoreCase),

        // Leftovers of unbalanced comments.
        new Regex(@"-->", RegexOptions.IgnoreCase),
        new Regex(@"<!--.*\n", RegexOptions.IgnoreCase)
    };

    // Entities that are decoded; any other numeric entity is removed.
    private static readonly Regex EntityRule = new Regex(
        @"&(quot|amp|lt|gt|nbsp|iexcl|cent|pound|copy|#\d+);", RegexOptions.IgnoreCase);

    /// <summary>
    /// Runs <see cref="StripHTML"/> on a sample document and prints the result.
    /// </summary>
    public static void Main()
    {
        string s = StripHTML(
            "<HTML><HEAD><TITLE>中国石龙信息平台</TITLE></HEAD><BODY>faddfs龙信息平台</BODY></HTML>");
        Console.WriteLine(s);
    }

    /// <summary>
    /// Extracts the text from HTML source.
    /// </summary>
    /// <param name="strHtml">Source text that may contain HTML markup. May be null.</param>
    /// <returns>The text with markup removed; an empty string for null or empty input.</returns>
    public static string StripHTML(string strHtml)
    {
        if (string.IsNullOrEmpty(strHtml))
        {
            return string.Empty;
        }

        string strOutput = strHtml;
        foreach (Regex rule in MarkupRules)
        {
            strOutput = rule.Replace(strOutput, "");
        }

        // string.Replace returns a new string, so the result has to be assigned. This runs before
        // entity decoding so that only leftover markup is dropped, not characters that the source
        // had escaped on purpose (for example "&lt;").
        strOutput = strOutput.Replace("<", "").Replace(">", "").Replace("\r\n", "");

        // Decode entities in a single pass so already-decoded text is never decoded a second time
        // (for example "&amp;lt;" becomes "&lt;", not "<").
        return EntityRule.Replace(strOutput, new MatchEvaluator(DecodeEntity));
    }

    /// <summary>
    /// Maps one matched entity to its replacement text; numeric entities without a mapping are removed.
    /// </summary>
    private static string DecodeEntity(Match entity)
    {
        switch (entity.Groups[1].Value.ToLowerInvariant())
        {
            case "quot":
            case "#34":
                return "\"";
            case "amp":
            case "#38":
                return "&";
            case "lt":
            case "#60":
                return "<";
            case "gt":
            case "#62":
                return ">";
            case "nbsp":
            case "#160":
                return " ";
            case "iexcl":
            case "#161":
                return "\u00A1"; // chr(161)
            case "cent":
            case "#162":
                return "\u00A2"; // chr(162)
            case "pound":
            case "#163":
                return "\u00A3"; // chr(163)
            case "copy":
            case "#169":
                return "\u00A9"; // chr(169)
            default:
                return string.Empty;
        }
    }
}