/// <summary>
/// Click handler: reads the text of <c>textBox1</c>, checks that it looks like a URL,
/// downloads the page with <c>GetHtml</c> and strips its markup with <c>StripHTML</c>.
/// </summary>
/// <param name="sender">Control that raised the event (unused).</param>
/// <param name="e">Event data (unused).</param>
private void button1_Click(object sender, EventArgs e)
{
    string input = this.textBox1.Text;

    // "<scheme>://<non-whitespace>". The scheme class used to be written [a-zA-z],
    // a range typo that also admitted the characters [ \ ] ^ _ and the backtick.
    const string urlPattern = @"[a-zA-Z]+://[^\s]*";

    if (!Regex.IsMatch(input, urlPattern))
    {
        return;
    }

    try
    {
        string html = GetHtml(input);
        if (html == null)
        {
            // GetHtml yields null when the download failed; there is nothing to strip.
            return;
        }

        // NOTE(review): the stripped text is not consumed anywhere in this handler yet.
        string plainText = StripHTML(html);
    }
    catch (Exception ex)
    {
        // Best effort: a page that cannot be processed must not crash the UI,
        // but the failure is traced instead of being discarded silently.
        System.Diagnostics.Debug.WriteLine(ex);
    }
}
1.獲取HTML
GetHtml(String Url)
View Code
/// <summary>
/// Downloads the resource at <paramref name="Url"/> and returns its body as text.
/// </summary>
/// <remarks>
/// The body is downloaded once and buffered. It is decoded as UTF-8 first; when
/// <c>isLuan</c> reports garbled characters in that text, the same bytes are decoded
/// again as GB2312 and that result is returned instead.
/// Failures while requesting or reading the response are shown in a message box.
/// </remarks>
/// <param name="Url">Address of the page to download.</param>
/// <returns>The decoded page text, or <c>null</c> when the download failed.</returns>
public String GetHtml(String Url)
{
    WebRequest request = WebRequest.Create(Url);
    request.Timeout = 50000; // milliseconds

    byte[] payload;
    try
    {
        // A single request is enough: the raw bytes are kept so they can be decoded
        // with more than one encoding. The using blocks release the connection even
        // when reading fails.
        using (WebResponse response = request.GetResponse())
        using (Stream body = response.GetResponseStream())
        using (MemoryStream buffer = new MemoryStream())
        {
            body.CopyTo(buffer);
            payload = buffer.ToArray();
        }
    }
    catch (WebException e)
    {
        MessageBox.Show(e.Message);
        return null;
    }
    catch (Exception e)
    {
        MessageBox.Show(e.ToString());
        return null;
    }

    string utf8Text = DecodeBody(payload, Encoding.GetEncoding("UTF-8"));
    if (!isLuan(utf8Text))
    {
        return utf8Text;
    }

    // The UTF-8 decoding produced garbled characters: fall back to GB2312.
    return DecodeBody(payload, Encoding.GetEncoding("GB2312"));
}

/// <summary>
/// Decodes buffered response bytes through a StreamReader using the given encoding.
/// </summary>
private static string DecodeBody(byte[] payload, Encoding encoding)
{
    using (MemoryStream stream = new MemoryStream(payload))
    using (StreamReader reader = new StreamReader(stream, encoding))
    {
        return reader.ReadToEnd();
    }
}
2.去除HTML標記(正則表達式)
StripHTML(string strHtml)
View Code
/// <summary>
/// Removes HTML markup from a string and returns the remaining text.
/// </summary>
/// <remarks>
/// Script and style blocks are dropped together with their content, <c>&amp;nbsp;</c>
/// entities become spaces, line breaks and tabs are removed, tags are stripped,
/// apostrophes are replaced by <c>’</c> and runs of spaces are collapsed to one.
/// </remarks>
/// <param name="strHtml">Source text that may contain HTML.</param>
/// <returns>
/// The text with markup removed; an empty string when <paramref name="strHtml"/>
/// is <c>null</c> or empty.
/// </returns>
public static string StripHTML(string strHtml)
{
    if (string.IsNullOrEmpty(strHtml))
    {
        return string.Empty;
    }

    string text = strHtml;

    // Drop <script> and <style> elements including everything between the tags.
    text = Regex.Replace(text, "(?is)<script[^>]*>.*?</script>", "");
    text = Regex.Replace(text, "(?is)<style[^>]*>.*?</style>", "");

    // Turn non-breaking-space entities into ordinary spaces. The entity had been
    // reduced to a literal space in this pattern, which made the step a no-op.
    text = Regex.Replace(text, "(?i)&nbsp;", " ");

    // Remove line breaks ("+" avoids matching the empty string at every position).
    text = Regex.Replace(text, @"[\r\n]+", "");

    // Remove simple tags: "<", anything except angle brackets, ">".
    text = Regex.Replace(text, "<[^<>]*>", "");

    // Remove tabs.
    text = Regex.Replace(text, @"\t+", "");

    // Replace the ASCII apostrophe with a typographic one.
    text = text.Replace("'", "’");

    // Collapse runs of spaces into a single space.
    text = Regex.Replace(text, " +", " ");

    // Remove whatever still looks like a tag (e.g. an outer tag that wrapped a
    // nested one), then any stray angle brackets.
    text = Regex.Replace(text, "<.+?>", "");
    text = text.Replace("<", "");
    text = text.Replace(">", "");

    // Remove an entity that only became contiguous after tags were stripped.
    // This used to be Replace(" ", ""), which deleted every space in the result.
    text = text.Replace("&nbsp;", "");

    return text;
}
3.判斷是否為亂碼(編碼):在GetHtml里調用。
View Code
/// <summary>
/// Reports whether a decoded text contains garbled characters.
/// </summary>
/// <remarks>
/// The text is encoded to UTF-8 and scanned for the byte triple 239 191 189
/// (0xEF 0xBF 0xBD), which is the UTF-8 form of the replacement character U+FFFD.
/// </remarks>
/// <param name="txt">Text to inspect.</param>
/// <returns>
/// <c>true</c> when the byte triple is present; <c>false</c> otherwise, including
/// for <c>null</c> or empty input.
/// </returns>
bool isLuan(string txt)
{
    if (string.IsNullOrEmpty(txt))
    {
        return false;
    }

    byte[] bytes = Encoding.UTF8.GetBytes(txt);

    // i + 2 must stay inside the array. The earlier bound (Length - 3) stopped one
    // position early and missed a triple sitting at the very end of the text.
    for (int i = 0; i + 2 < bytes.Length; i++)
    {
        if (bytes[i] == 239 && bytes[i + 1] == 191 && bytes[i + 2] == 189)
        {
            return true;
        }
    }

    return false;
}
