C#數據采集用到的幾個方法


這兩天在做數據採集,因此整理了一些數據採集會用到的方法。因為我採集的數據比較簡單,所以沒有用到框架。比較有名的兩個框架是 HtmlAgilityPack 和 Jumony,感興趣的可以研究一下。當然,火車頭採集工具也很方便,不過需要付費。下面是整理的代碼:

 

   /// <summary>
    /// Regular-expression helpers for extracting fragments from HTML text
    /// and for stripping HTML tags from a string.
    /// </summary>
    public class HtmlRegex
    {
        /// <summary>
        /// Matches any HTML tag, opening or closing: a left angle bracket,
        /// one or more characters that are not a right angle bracket, then a
        /// right angle bracket. Closing tags need no separate alternative
        /// because the slash is itself a "not right angle bracket" character.
        /// </summary>
        private static readonly Regex AllTagRegex = new Regex(@"<[^>]+>");

        /// <summary>
        /// Removes every HTML tag from the given text.
        /// </summary>
        /// <param name="content">Source HTML.</param>
        /// <returns>
        /// <paramref name="content"/> with all tags deleted, or an empty string
        /// when <paramref name="content"/> is null or empty.
        /// </returns>
        public static string RemoveAllHtml(string content)
        {
            if (string.IsNullOrEmpty(content))
            {
                return string.Empty;
            }

            return AllTagRegex.Replace(content, "");
        }

        /// <summary>
        /// Returns the first match of a regular expression in the given text.
        /// </summary>
        /// <param name="regStr">Regular expression pattern.</param>
        /// <param name="content">Source HTML.</param>
        /// <param name="hashtml">
        /// When true the match is returned as-is; when false HTML tags are
        /// stripped from the match before it is returned.
        /// </param>
        /// <returns>
        /// The first match, or an empty string when there is no match or when
        /// <paramref name="content"/> is null or empty.
        /// </returns>
        /// <exception cref="System.ArgumentException">
        /// <paramref name="regStr"/> is null or is not a valid pattern.
        /// </exception>
        public static string GetStrByRegex(string regStr, string content, bool hashtml = true)
        {
            // Build the Regex first so an invalid pattern is still reported
            // even when there is no content to search.
            Regex reg = new Regex(regStr);

            if (string.IsNullOrEmpty(content))
            {
                return string.Empty;
            }

            Match mth = reg.Match(content);
            if (!mth.Success)
            {
                return string.Empty;
            }

            return hashtml ? mth.Value : RemoveAllHtml(mth.Value);
        }

        /// <summary>
        /// Returns the first, shortest span of text that begins with
        /// <paramref name="start"/> and ends with <paramref name="end"/>.
        /// Matching is case-insensitive and may span multiple lines.
        /// </summary>
        /// <param name="start">
        /// Opening delimiter. It is inserted into the pattern unescaped, so it
        /// is interpreted as a regular expression fragment; pass it through
        /// Regex.Escape first if it must be matched literally.
        /// </param>
        /// <param name="end">
        /// Closing delimiter; interpreted as a regular expression fragment in
        /// the same way as <paramref name="start"/>.
        /// </param>
        /// <param name="content">Source HTML.</param>
        /// <param name="hasHtml">
        /// When true the match is returned as-is; when false HTML tags are
        /// stripped from the match before it is returned.
        /// </param>
        /// <returns>
        /// The matched span including both delimiters, or an empty string when
        /// there is no match or when <paramref name="content"/> is null or empty.
        /// </returns>
        /// <exception cref="System.ArgumentException">
        /// The combined pattern is not a valid regular expression.
        /// </exception>
        public static string GetStrByRegex(string start, string end, string content, bool hasHtml = true)
        {
            return GetStrByRegex(BuildRangePattern(start, end), content, hasHtml);
        }

        /// <summary>
        /// Returns every match of a regular expression in the given text.
        /// </summary>
        /// <param name="regStr">Regular expression pattern.</param>
        /// <param name="content">Source HTML.</param>
        /// <returns>
        /// The matched strings in order of appearance, or null when there is
        /// no match, when either argument is null, or when
        /// <paramref name="regStr"/> is not a valid pattern.
        /// </returns>
        public static List<string> GetStrListByRegex(string regStr, string content)
        {
            if (regStr == null || content == null)
            {
                return null;
            }

            try
            {
                MatchCollection matches = new Regex(regStr).Matches(content);

                // Null (not an empty list) signals "nothing found"; callers of
                // this helper rely on that contract.
                if (matches.Count == 0)
                {
                    return null;
                }

                var found = new List<string>(matches.Count);
                foreach (Match m in matches)
                {
                    found.Add(m.Value);
                }
                return found;
            }
            catch (System.ArgumentException)
            {
                // Invalid pattern: this method is best-effort and reports the
                // failure as "no result" instead of throwing. Other exception
                // types are deliberately not swallowed.
                return null;
            }
        }

        /// <summary>
        /// Returns every shortest span of text that begins with
        /// <paramref name="start"/> and ends with <paramref name="end"/>.
        /// Matching is case-insensitive and may span multiple lines.
        /// </summary>
        /// <param name="start">
        /// Opening delimiter. It is inserted into the pattern unescaped, so it
        /// is interpreted as a regular expression fragment; pass it through
        /// Regex.Escape first if it must be matched literally.
        /// </param>
        /// <param name="end">
        /// Closing delimiter; interpreted as a regular expression fragment in
        /// the same way as <paramref name="start"/>.
        /// </param>
        /// <param name="content">Source HTML.</param>
        /// <returns>
        /// The matched spans (delimiters included) in order of appearance, or
        /// null when there is no match, when <paramref name="content"/> is
        /// null, or when the combined pattern is not a valid regular expression.
        /// </returns>
        public static List<string> GetStrListByRegex(string start, string end, string content)
        {
            return GetStrListByRegex(BuildRangePattern(start, end), content);
        }

        /// <summary>
        /// Builds the "start ... end" pattern shared by the delimiter-based
        /// overloads: (?i) ignores case, (?s) lets '.' cross line breaks, and
        /// the lazy quantifier stops at the nearest closing delimiter.
        /// </summary>
        private static string BuildRangePattern(string start, string end)
        {
            return @"(?is)(" + start + ").*?(" + end + ")";
        }

    }

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM