網站采集（根據正則表達式截取需要的html數據）

本文轉載自查看原文 2012-01-31 16:22 3726 asp.net

所有網站都可以通過url地址獲取該網站編譯之后的html源代碼，方法如下：

需要用到的命名空間：

using System;

using System.Collections.Generic;

using System.Text;

using System.Diagnostics;

using System.Text.RegularExpressions;

using System.IO;

using System.Net;

/// <summary>
/// Downloads the HTML source of a page and decodes it with the named character set.
/// </summary>
/// <param name="url">Page address, e.g. "http://www.xxx.com/".</param>
/// <param name="charset">Encoding name, e.g. "utf-8". Null or empty selects <see cref="Encoding.Default"/>.</param>
/// <returns>The page source, or an empty string when the download fails.</returns>
/// <exception cref="ArgumentException"><paramref name="charset"/> is not a recognised encoding name.</exception>
public static string GetHtmlSource(string url, string charset)
{
    // Resolve the encoding before any network traffic so an invalid name fails fast
    // (this lookup was outside the try block in the original as well).
    Encoding nowCharset = string.IsNullOrEmpty(charset)
        ? Encoding.Default
        : Encoding.GetEncoding(charset);

    try
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);

        // The using blocks release the connection and both streams even when reading throws.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (Stream stream = response.GetResponseStream())
        using (StreamReader reader = new StreamReader(stream, nowCharset))
        {
            return reader.ReadToEnd();
        }
    }
    catch (Exception ex)
    {
        // Best-effort contract: callers still receive "" on any failure,
        // but the cause is traced instead of being silently discarded.
        Trace.TraceWarning("GetHtmlSource failed for '" + url + "': " + ex.Message);
        return "";
    }
}

/// <summary>
/// Downloads the HTML source of a page and decodes it with the given encoding.
/// </summary>
/// <param name="url">Page address, e.g. "http://www.xxx.com/".</param>
/// <param name="charset">Encoding used to decode the response, e.g. Encoding.UTF8. Null selects <see cref="Encoding.Default"/>.</param>
/// <returns>The page source, or an empty string when the download fails.</returns>
public static string GetHtmlSource(string url, Encoding charset)
{
    // A null encoding used to make the request and then fail silently inside the reader;
    // fall back to the default encoding, as the string-based overload does.
    Encoding effectiveCharset = charset ?? Encoding.Default;

    try
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);

        // The using blocks release the connection and both streams even when reading throws.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (Stream stream = response.GetResponseStream())
        using (StreamReader reader = new StreamReader(stream, effectiveCharset))
        {
            return reader.ReadToEnd();
        }
    }
    catch (Exception ex)
    {
        // Best-effort contract: callers still receive "" on any failure,
        // but the cause is traced instead of being silently discarded.
        Trace.TraceWarning("GetHtmlSource failed for '" + url + "': " + ex.Message);
        return "";
    }
}

/// <summary>
/// Downloads the HTML source of a page, decoding it with <see cref="Encoding.Default"/>.
/// The reader honours a byte-order mark, so pages that carry a BOM are decoded
/// correctly whatever their encoding.
/// </summary>
/// <param name="url">Page address, e.g. "http://www.xxx.com/".</param>
/// <returns>The page source, or an empty string when the download fails.</returns>
public static string GetHtmlSource(string url)
{
    try
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);

        // The using blocks release the connection and both streams even when reading throws.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (Stream stream = response.GetResponseStream())
        using (StreamReader reader = new StreamReader(stream, Encoding.Default))
        {
            return reader.ReadToEnd();
        }
    }
    catch (Exception ex)
    {
        // Best-effort contract: callers still receive "" on any failure,
        // but the cause is traced instead of being silently discarded.
        Trace.TraceWarning("GetHtmlSource failed for '" + url + "': " + ex.Message);
        return "";
    }
}

根據不同的情況調用不同的方式去獲取，例如：

string _html = Collection.GetHtmlSource("http://www.luohx.com/a.html", "utf-8");

也可以在url參數里面附帶參數，例如

string _html = Collection.GetHtmlSource("http://www.luohx.com/a.aspx?a=1&b=2", "utf-8");

採集到網站源碼後，我們往往並不需要全部代碼，而只需要其中的一部分，比如標簽 <div id="xml" class="wrap"></div> 中間的html。這時就需要對源代碼進行截取，方法如下：

#region 獲取畫冊頁面代碼

/// <summary>
/// Downloads a page and returns the inner HTML of its
/// &lt;div id="xml" class="wrap"&gt; element, including any nested &lt;div&gt; elements.
/// </summary>
/// <param name="url">Page address passed to Collection.GetHtmlSource.</param>
/// <param name="charset">Encoding name passed to Collection.GetHtmlSource.</param>
/// <returns>
/// The captured inner HTML; an empty string when the element is not found.
/// If the page contains several matching elements, the content of the last one is returned.
/// </returns>
public string strHtml(string url, string charset)
{
    string pageHtml = Collection.GetHtmlSource(url, charset);

    // Balancing-group pattern. Options: s = '.' matches newlines, i = ignore case,
    // x = whitespace inside the pattern is ignored (so it can be laid out over lines).
    // Inside the atomic group each iteration consumes one of:
    //   - a single character that does not start <div or </div>,
    //   - an opening <div ...> tag, which pushes onto the 'div' stack,
    //   - a closing </div> tag, which pops the stack (and fails when it is empty).
    // (?(div)(?!)) then rejects the match unless every nested <div> was closed.
    // The original literal had lost the '|' alternations and the ')*' closing the atomic
    // group, leaving the parentheses unbalanced so the Regex constructor threw.
    const string pattern = @"(?six)<div\s+id=""xml""\s+class=""wrap"">
        (?'MyCont'
            (?>
                (?!<div\b|</div>).
              |
                <div(?:\s+(?:""[^""]*""|'[^']*'|[^""'>])*)?>(?'div')
              |
                </div>(?'-div')
            )*
            (?(div)(?!))
        )
        </div>";

    string content = "";
    foreach (Match match in Regex.Matches(pageHtml, pattern))
    {
        // Later matches overwrite earlier ones, so the last match wins.
        content = match.Groups["MyCont"].Value;
    }

    return content;
}

#endregion

這里的參數pattern就是針對標簽 <div id="xml" class="wrap"></div> 的正則表達式。但是必須保證選取的參考標簽在頁面中是唯一的：如果頁面里同時存在2個或者2個以上的 <div id="xml" class="wrap"></div>，就不能用這個標簽作為參考來判定。

截取到需要的html代碼模塊之後，得到的仍然是一段html代碼。如果我們需要的是不包含html元素的純文本內容，就要將其中的html標簽去掉，例如：

// Matches a whole <script>, <iframe> or <frameset> element including its content.
// The lazy quantifier stops at the first matching closing tag, so text that sits
// between two separate elements is preserved.
private static readonly Regex CheckStrBlockRegex = new Regex(
    @"<(script|iframe|frameset)\b[\s\S]*?</\1\s*>",
    RegexOptions.IgnoreCase);

// Matches any single remaining tag (<p>, </p>, <img ...>, <strong>, ...).
private static readonly Regex CheckStrTagRegex = new Regex(@"<[^>]*>");

/// <summary>
/// Strips HTML markup from a fragment and returns the remaining plain text.
/// </summary>
/// <remarks>
/// Script, iframe and frameset elements are removed together with their content;
/// every other tag is removed while its inner text is kept. Non-breaking spaces
/// (the &amp;nbsp; entity or the U+00A0 character) become ordinary spaces.
/// The earlier " href…script:" and " no…=" substitutions were dropped: they matched
/// greedily across the whole document and corrupted ordinary text, and they are
/// unnecessary because every tag is removed anyway.
/// </remarks>
/// <param name="html">HTML fragment to clean; null is treated as empty.</param>
/// <returns>The text content of <paramref name="html"/> without any tags.</returns>
public static string checkStr(string html)
{
    if (string.IsNullOrEmpty(html))
    {
        return string.Empty;
    }

    // Order matters: drop elements whose content must not survive before the
    // generic tag strip would otherwise leave that content behind as text.
    string text = CheckStrBlockRegex.Replace(html, "");
    text = CheckStrTagRegex.Replace(text, "");

    return text.Replace("&nbsp;", " ").Replace('\u00A0', ' ');
}

調用方法很簡單，直接 string strhtml = checkStr(html); 就可以了。得到所需要的數據後，就可以進行入庫、顯示等其他操作了~

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯系本站郵箱yoyou2525@163.com刪除。

猜您在找 正則表達式——html shell編程值之正則表達式與字符截取(6) 正則表達式——數據提取正則表達式中需要轉義的字符正則表達式需要轉義的字符正則表達式正則表達式正則表達式正則表達式 java 正則表達式提取html純文本