.Net HttpWebRequest 爬蟲核心爬取


1 爬蟲,爬蟲攻防

2 下載html

3 xpath解析html,獲取數據和深度抓取(和正則匹配)

4 多線程抓取

熟悉http協議

提供兩個方法Post和Get

/// <summary>
/// Downloads the body of <paramref name="url"/> with an HTTP GET request and returns it as text.
/// </summary>
/// <param name="url">Address to fetch; must be an HTTP/HTTPS URL.</param>
/// <param name="encoding">
/// Encoding used to decode the response body. Defaults to UTF-8 when null;
/// pass another encoding (for example GB2312) if the page comes back garbled.
/// </param>
/// <param name="headDic">Optional extra request headers (name to value) added through <c>Headers.Add</c>.</param>
/// <returns>
/// The response body; <see cref="string.Empty"/> when the status is not 200 OK (a warning is logged);
/// <c>null</c> when the request or the read failed (the error is logged).
/// </returns>
public static string HttpGet(string url, Encoding encoding = null,  Dictionary<string,string> headDic=null)
        {
            string html = string.Empty;
            try
            {
                // Direct cast: a non-HTTP url fails here with a clear InvalidCastException
                // instead of a NullReferenceException on the next line.
                HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
                request.Timeout = 30 * 1000; // 30 seconds, in milliseconds
                request.UserAgent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36";
                request.ContentType = "text/html; charset=utf-8";
                if (headDic != null)
                {
                    // NOTE(review): Headers.Add may reject restricted headers such as User-Agent;
                    // that exception is caught below and results in a null return.
                    foreach (var item in headDic)
                    {
                        request.Headers.Add(item.Key, item.Value);
                    }
                }

                // Honor the caller's encoding; fall back to UTF-8 only when none was supplied.
                // (Previously any supplied encoding was overwritten with GB2312.)
                if (encoding == null)
                {
                    encoding = Encoding.UTF8;
                }

                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                {
                    if (response.StatusCode != HttpStatusCode.OK)
                    {
                        log.Warn(string.Format("抓取{0}地址返回失敗,response.StatusCode為{1}", url, response.StatusCode));
                    }
                    else
                    {
                        try
                        {
                            // using guarantees the stream and reader are released even if ReadToEnd throws.
                            using (Stream body = response.GetResponseStream())
                            using (StreamReader sr = new StreamReader(body, encoding))
                            {
                                html = sr.ReadToEnd();
                            }
                        }
                        catch (Exception ex)
                        {
                            log.Error(string.Format("DownloadHtml抓取{0}保存失敗", url), ex);
                            html = null;
                        }
                    }
                }
            }
            catch (Exception ex)
            {
                // Covers WebException as well: every failure is logged with the url and reported as null.
                // (Previously only a WebException with one exact localized message was logged;
                // all others were silently swallowed.)
                log.Error(string.Format("DownloadHtml抓取{0}出現異常", url), ex);
                html = null;
            }
            return html;
        }

 

        /// <summary>
        /// Sends <paramref name="value"/> as a UTF-8, form-url-encoded HTTP POST body to
        /// <paramref name="url"/> and returns the trimmed response text.
        /// </summary>
        /// <param name="url">Address of the endpoint to call.</param>
        /// <param name="value">Request body (already url-encoded); null is sent as an empty body.</param>
        /// <returns>
        /// The trimmed response body when the status is 200 OK; otherwise a string starting with
        /// "失敗:Status:" (non-200 status) or "失敗:ex:" (an exception occurred). Never throws.
        /// </returns>
        public static string HttpPost(string url, string value)
        {
            // Treat null as an empty body so the method keeps its "return a failure string" contract
            // instead of throwing ArgumentNullException before the try block.
            byte[] postData = Encoding.UTF8.GetBytes(value ?? string.Empty);
            try
            {
                HttpWebRequest myRequest = (HttpWebRequest)WebRequest.Create(url);

                myRequest.Method = "POST";
                myRequest.ContentType = "application/x-www-form-urlencoded";
                myRequest.ContentLength = postData.Length;

                // Close the request stream before asking for the response.
                using (Stream stream = myRequest.GetRequestStream())
                {
                    stream.Write(postData, 0, postData.Length);
                }

                // Dispose the response so the underlying connection is released.
                using (HttpWebResponse myResponse = (HttpWebResponse)myRequest.GetResponse())
                {
                    if (myResponse.StatusCode != HttpStatusCode.OK)
                    {
                        return "失敗:Status:" + myResponse.StatusCode.ToString();
                    }

                    using (StreamReader sr = new StreamReader(myResponse.GetResponseStream(), Encoding.UTF8))
                    {
                        return sr.ReadToEnd().Trim();
                    }
                }
            }
            catch (Exception ex)
            {
                return "失敗:ex:" + ex.ToString();
            }
        }

下載Html

// Append the crawled html to a text file; using releases the file handle even if Write throws.
using (StreamWriter sw = new StreamWriter("路徑.txt", true, Encoding.GetEncoding("utf-8")))
{
    sw.Write("爬取的html字符串");
}

 xpath

http://www.cnblogs.com/zhaozhan/archive/2009/09/09/1563617.html

http://www.cnblogs.com/zhaozhan/archive/2009/09/09/1563679.html

http://www.cnblogs.com/zhaozhan/archive/2009/09/10/1563703.html

正則匹配

目前使用起來最好用的正則

 <title>(?<html>[\s\S]+?)</title>  意思是以非貪婪方式匹配 <title>…</title> 標籤里面的任意字符串(包括換行),匹配到的內容存入名為 html 的命名分組

 

// Capture the text between <title> and </title> (non-greedy, newlines included) in the named group "html".
Regex reTitle = new Regex(@"<title>(?<html>[\s\S]+?)</title>");
// Groups["html"].Value is an empty string when the page has no <title> element.
string title = reTitle.Match(html).Groups["html"].Value;

 多個選擇

// Each match is one run of five table cells: company, id, category, grade and date,
// exposed through the named groups of the pattern.
var rowPattern = new Regex(@"<td align=""left"">(?<company>[^<>]+)</td><td align=""center"">(?<id>[\dA-Z]+)</td><td align=""center"">(?<cat>[^<>]+)</td><td align=""center"">(?<grade>[A-Z]+)</td><td align=""center"">(?<date>[^\s&]*)");
foreach (Match row in rowPattern.Matches(strHtml))
{
    GroupCollection fields = row.Groups;
    string company = fields["company"].Value;
    string id = fields["id"].Value;
    // The category cell may carry non-breaking-space entities; strip them.
    string category = fields["cat"].Value.Replace("&nbsp;", "");
    string grade = fields["grade"].Value;
    string date = fields["date"].Value;
}

多線程

// Run Crawler 100 times as tasks while limiting how many unfinished tasks are tracked at once.
List<Task> taskList = new List<Task>();
TaskFactory taskFactory = new TaskFactory();
for (int i = 0; i < 100; i++)
{
    taskList.Add(taskFactory.StartNew(Crawler)); // create and start a task that runs Crawler
    if (taskList.Count > 15) // throttle once more than 15 tasks are being tracked
    {
        // Keep only unfinished tasks. IsCompleted is already true for canceled and faulted tasks,
        // so no separate IsCanceled / IsFaulted checks are needed.
        // NOTE(review): faulted tasks are dropped here, so their exceptions are not observed by WaitAll below.
        taskList = taskList.Where(t => !t.IsCompleted).ToList();
        Task.WaitAny(taskList.ToArray()); // block until one of the running tasks finishes
    }
}
Task.WaitAll(taskList.ToArray()); // wait for the remaining tasks to finish
// The format string needs a {0} placeholder; without it the timestamp argument was silently ignored.
Console.WriteLine("抓取全部完成 - - {0}", DateTime.Now);

該文檔只是自己記錄,純屬記事本

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯繫本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM