需要記住的,隨筆記一下
1、抓取遠程網頁源碼,這裡要實現自動判斷網頁編碼,否則有可能抓到亂碼。我是先看應答的 http 頭的 charset,一般這個很準,但像 csdn 的新聞比較變態,http 應答頭裡的 charset 和網頁 meta 裡聲明的 charset 不一致,所以我手工加了一下判斷,如果不一致,再在內存流裡用網頁聲明的編碼重新讀取一遍源碼。 2、把網頁分割成幾大塊。試用了一下 tidy 的 .net 包裝及 HtmlParse 的 .net 版本,都不太好用。於是我自己寫了個算法,可以把網頁裡的 div 塊、td 塊等都提取出來,支持嵌套的情況。一般只提取 div 的文字塊兒就行了。 3、把漢字少於200的文本塊去掉,一般少於200字的文本塊不會是正文,即便是正文,一般來說也不會有太多的價值,我直接去掉。 4、因為 div 支持嵌套,所以剩下的文本塊有可能是重複的,一個是另一個的父節點,所以要把最裡層的文本塊找出來。最裡層的文本塊肯定是漢字最多、而其它文本最少的,所以要計算出剩餘文本塊中漢字占所有字符比例最高的文本塊,基本上它就是正文的文本塊了。當然有的網頁正文裡也可能還有 div 的文本塊,這時候可能會判斷錯誤,但只要正文嵌套的 div 文本塊的漢字少於200字,我的算法還是能準確提取正文文本塊的。這一步我寫了一個自定義的比較方法傳遞給 List 的 Sort 方法。 5、把<p><br>等標籤替換成特殊占位符[p][br]等,因為最終的正文需要保留段落和回車換行等格式。這一步用正則實現。 6、把最後剩下的文本塊的 html 標籤去掉,我用正則過濾的。 7、把[p]替換成回車換行加倆空格,把[br]替換成回車換行,這步也用正則。到此,正文提取完畢。
主要代碼:
[csharp]
view plain copy

/// <summary>
/// Helpers for downloading a web page and extracting its main (Chinese) body text.
/// The extraction heuristic is: collect every (possibly nested) div, discard those
/// with too few Chinese characters, keep the one whose share of Chinese characters
/// is highest, then strip its markup while preserving paragraph and line breaks.
/// </summary>
public class GetMainContentHelper
{
    /// <summary>
    /// Default minimum number of Chinese characters a div must contain to be
    /// considered a body-text candidate. The original inline comment spoke of 200,
    /// but the code has always filtered at 300, so 300 is kept as the default.
    /// </summary>
    private const int DefaultMinChineseChars = 300;

    /// <summary>Matches a single character in the CJK range U+4E00..U+9FA5.</summary>
    private static readonly Regex ChineseCharRegex = new Regex("[\u4e00-\u9fa5]");

    /// <summary>Matches opening p / br tags so they can be turned into placeholders.</summary>
    private static readonly Regex ParagraphOrBreakRegex =
        new Regex(@"<(p|br)[^<]*>", RegexOptions.Multiline | RegexOptions.IgnoreCase);

    /// <summary>
    /// Matches everything that is removed from the chosen block: stand-alone links,
    /// style/select/script elements, comments, list items, remaining tags, entities,
    /// colour codes and all whitespace.
    /// NOTE(review): the copy of this pattern in the source began with an additional
    /// bracket-based alternative that had been corrupted (brackets stripped, a line
    /// break inserted), leaving an unbalanced parenthesis that made the Regex
    /// constructor throw. That fragment is omitted here; all remaining alternatives
    /// are unchanged. Restore it from the original source if it turns out to matter.
    /// </summary>
    private static readonly Regex MarkupRegex = new Regex(
        @"(?<lj>(?=[^\u4E00-\u9FA5\uFE30-\uFFA0,."");])<a\s+[^>]*>[^<]{2,}</a>(?=[^\u4E00-\u9FA5\uFE30-\uFFA0,."");]))"
        + @"|(?<Style><style[\s\S]+?/style>)"
        + @"|(?<select><select[\s\S]+?/select>)"
        + @"|(?<Script><script[\s\S]*?/script>)"
        + @"|(?<Explein><\!\-\-[\s\S]*?\-\->)"
        + @"|(?<li><li(\s+[^>]+)?>[\s\S]*?/li>)"
        + @"|(?<Html></?\s*[^> ]+(\s*[^=>]+?=['""]?[^""']+?['""]?)*?[^\[<]*>)"
        + @"|(?<Other>&[a-zA-Z]+;)"
        + @"|(?<Other2>\#[a-z0-9]{6})"
        + @"|(?<Space>\s+)"
        + @"|(\&\#\d+\;)",
        RegexOptions.Multiline | RegexOptions.IgnoreCase);

    /// <summary>Matches the [p] placeholder produced from paragraph tags.</summary>
    private static readonly Regex ParagraphPlaceholderRegex =
        new Regex("\\[p]", RegexOptions.Multiline | RegexOptions.IgnoreCase);

    /// <summary>Matches the [br] placeholder produced from line-break tags.</summary>
    private static readonly Regex BreakPlaceholderRegex =
        new Regex("\\[br]", RegexOptions.Multiline | RegexOptions.IgnoreCase);

    /// <summary>
    /// Captures the charset name declared in a meta element, for both the
    /// http-equiv form (content="text/html; charset=x") and the short form
    /// (charset="x"). Only the name itself is captured, never quotes.
    /// </summary>
    private static readonly Regex MetaCharsetRegex =
        new Regex(@"<meta[^>]+?charset\s*=\s*[""']?\s*([\w\-.:]+)", RegexOptions.IgnoreCase);

    /// <summary>
    /// Compares two texts by the proportion of Chinese characters they contain.
    /// </summary>
    /// <param name="x">First text; null sorts before any non-null text.</param>
    /// <param name="y">Second text; null sorts before any non-null text.</param>
    /// <returns>
    /// A negative value when <paramref name="x"/> has the lower proportion, a positive
    /// value when it has the higher one. Equal proportions are ordered by an ordinal
    /// comparison of the texts so that the ordering is total and culture-independent.
    /// </returns>
    public static int CompareDinosByChineseLength(string x, string y)
    {
        if (x == null)
        {
            return y == null ? 0 : -1;
        }
        if (y == null)
        {
            return 1;
        }

        int byRatio = ChineseRatio(x).CompareTo(ChineseRatio(y));
        return byRatio != 0 ? byRatio : string.CompareOrdinal(x, y);
    }

    /// <summary>
    /// Extracts every element with the given tag name from a piece of markup,
    /// including nested ones. Typically used for containers such as div or td.
    /// </summary>
    /// <param name="input">Markup to scan; null or empty yields an empty list.</param>
    /// <param name="tag">Tag name without angle brackets, matched case-insensitively.</param>
    /// <returns>
    /// The outer markup of each matched element, in the order in which the closing
    /// tags occur (so inner elements precede the elements that contain them).
    /// </returns>
    public static List<string> GetTags(string input, string tag)
    {
        List<string> found = new List<string>();
        if (string.IsNullOrEmpty(input) || string.IsNullOrEmpty(tag))
        {
            return found;
        }

        string openPrefix = "<" + tag;
        string closePrefix = "</" + tag;
        Stack<int> openPositions = new Stack<int>();
        int tagStart = -1; // index of the most recent '<' not yet closed by '>'

        for (int i = 0; i < input.Length; i++)
        {
            char current = input[i];
            if (current == '<')
            {
                tagStart = i;
            }
            else if (current == '>' && tagStart >= 0)
            {
                if (IsTagAt(input, tagStart, openPrefix))
                {
                    openPositions.Push(tagStart);
                }
                else if (IsTagAt(input, tagStart, closePrefix) && openPositions.Count > 0)
                {
                    int start = openPositions.Pop();
                    found.Add(input.Substring(start, i - start + 1));
                }

                // A '>' terminates the pending tag; later stray '>' characters in
                // text must not re-evaluate (and re-push) the same opening tag.
                tagStart = -1;
            }
        }

        return found;
    }

    /// <summary>
    /// Downloads the page at the given URL and decodes it to text, detecting the
    /// encoding from the HTTP header and, when it disagrees, from the page's own
    /// meta charset declaration.
    /// </summary>
    /// <param name="url">Absolute HTTP(S) URL of the page.</param>
    /// <returns>
    /// The page source, or an empty string when the request fails or the status is
    /// not 200 OK. Failures during the request are traced, not thrown.
    /// </returns>
    public static string getDataFromUrl(string url)
    {
        string html = string.Empty;
        HttpWebRequest request = (HttpWebRequest)HttpWebRequest.Create(url);
        // Configure the HTTP request headers.
        request.AllowAutoRedirect = true;
        request.AllowWriteStreamBuffering = true;
        request.Referer = "";
        request.Timeout = 10 * 1000;
        request.UserAgent = "";

        HttpWebResponse response = null;
        try
        {
            response = (HttpWebResponse)request.GetResponse();
            if (response.StatusCode == HttpStatusCode.OK)
            {
                // First guess: the charset announced in the HTTP response header.
                Encoding headerEncoding = ResolveHeaderEncoding(response.CharacterSet);

                // Buffer the raw bytes so they can be decoded a second time if needed.
                byte[] body = ReadResponseBytes(response);
                html = Decode(body, headerEncoding);

                // If the page declares a different charset than the header, the page
                // wins: decode the buffered bytes again with the declared encoding.
                Match declared = MetaCharsetRegex.Match(html);
                if (declared.Success)
                {
                    Encoding pageEncoding = TryGetEncoding(declared.Groups[1].Value);
                    if (pageEncoding != null && pageEncoding.CodePage != headerEncoding.CodePage)
                    {
                        html = Decode(body, pageEncoding);
                    }
                }
            }
        }
        catch (Exception ex)
        {
            // Best-effort fetch: record the failure and return what we have.
            Trace.TraceError(ex.ToString());
        }
        finally
        {
            if (response != null)
            {
                response.Close();
            }
        }

        return html;
    }

    /// <summary>
    /// Extracts the main body text from a page's source using the default
    /// minimum of 300 Chinese characters per candidate block.
    /// </summary>
    /// <param name="input">Page source; null or empty yields an empty string.</param>
    /// <returns>The extracted text, or an empty string when no block qualifies.</returns>
    public static string GetMainContent(string input)
    {
        return GetMainContent(input, DefaultMinChineseChars);
    }

    /// <summary>
    /// Extracts the main body text from a page's source.
    /// </summary>
    /// <param name="input">Page source; null or empty yields an empty string.</param>
    /// <param name="minChineseChars">
    /// Minimum number of Chinese characters a div must contain to be considered.
    /// </param>
    /// <returns>
    /// The text of the qualifying div with the highest proportion of Chinese
    /// characters, with markup and whitespace removed, paragraph tags replaced by
    /// a line break plus a space and br tags by a line break; an empty string when
    /// no div qualifies.
    /// </returns>
    public static string GetMainContent(string input, int minChineseChars)
    {
        // 1. Collect all div elements, 2. drop those with too few Chinese characters,
        // 3. keep the one that ranks highest by Chinese-character proportion.
        string best = null;
        foreach (string candidate in GetTags(input, "div"))
        {
            if (ChineseCharRegex.Matches(candidate).Count < minChineseChars)
            {
                continue;
            }
            if (best == null || CompareDinosByChineseLength(candidate, best) > 0)
            {
                best = candidate;
            }
        }

        if (best == null)
        {
            return "";
        }

        // 4. Turn p / br tags into [p] / [br] placeholders so they survive stripping.
        string text = ParagraphOrBreakRegex.Replace(best, "[$1]");
        // 5. Remove all remaining markup (and whitespace), keeping the text.
        text = MarkupRegex.Replace(text, "");
        // 6. Turn the placeholders into real line breaks.
        text = ParagraphPlaceholderRegex.Replace(text, "\r\n ");
        text = BreakPlaceholderRegex.Replace(text, "\r\n");
        return text;
    }

    /// <summary>Returns the share of Chinese characters in a text (0 for empty text).</summary>
    private static float ChineseRatio(string text)
    {
        if (text.Length == 0)
        {
            return 0f; // avoid 0/0 = NaN
        }
        return (float)ChineseCharRegex.Matches(text).Count / (float)text.Length;
    }

    /// <summary>
    /// Tells whether the markup at <paramref name="start"/> begins with the given
    /// tag prefix (case-insensitive) followed by a character that ends a tag name.
    /// </summary>
    private static bool IsTagAt(string markup, int start, string prefix)
    {
        int end = start + prefix.Length;
        if (end >= markup.Length)
        {
            return false;
        }
        if (string.Compare(markup, start, prefix, 0, prefix.Length, StringComparison.OrdinalIgnoreCase) != 0)
        {
            return false;
        }

        // Require a name boundary so that e.g. "<divider" is not taken for "<div".
        char next = markup[end];
        return next == '>' || next == '/' || char.IsWhiteSpace(next);
    }

    /// <summary>Maps the HTTP header charset to an encoding, falling back to the system default.</summary>
    private static Encoding ResolveHeaderEncoding(string charset)
    {
        if (string.IsNullOrEmpty(charset))
        {
            return Encoding.Default;
        }
        if (charset == "ISO-8859-1")
        {
            // Kept from the original: an ISO-8859-1 header is treated as gb2312.
            charset = "gb2312";
        }

        Encoding resolved = TryGetEncoding(charset);
        return resolved ?? Encoding.Default;
    }

    /// <summary>Looks up an encoding by name; returns null when the name is unknown.</summary>
    private static Encoding TryGetEncoding(string name)
    {
        if (name == null)
        {
            return null;
        }

        string cleaned = name.Trim().Trim('"', '\'');
        if (cleaned.Length == 0)
        {
            return null;
        }

        try
        {
            return Encoding.GetEncoding(cleaned);
        }
        catch (ArgumentException)
        {
            return null;
        }
        catch (NotSupportedException)
        {
            return null;
        }
    }

    /// <summary>Reads the whole response body into memory.</summary>
    private static byte[] ReadResponseBytes(HttpWebResponse response)
    {
        using (Stream source = response.GetResponseStream())
        using (MemoryStream buffer = new MemoryStream())
        {
            byte[] chunk = new byte[8192];
            int read;
            while ((read = source.Read(chunk, 0, chunk.Length)) > 0)
            {
                buffer.Write(chunk, 0, read);
            }
            return buffer.ToArray();
        }
    }

    /// <summary>Decodes buffered bytes to text with the given encoding.</summary>
    private static string Decode(byte[] bytes, Encoding encoding)
    {
        using (MemoryStream stream = new MemoryStream(bytes))
        using (StreamReader reader = new StreamReader(stream, encoding))
        {
            return reader.ReadToEnd();
        }
    }
}
