目錄:信息采集入門系列目錄
下面記錄的是我自己整理的C#請求頁面核心類,主要有如下幾個方法
1.HttpWebRequest Get請求獲得頁面html
2.HttpWebRequest Post請求獲得頁面html
3.模擬登錄獲得cookie內容
4.模擬登錄獲得cookie字符串
5.代理的設置
6.利用webbrowser 獲取js生成的頁面
7.為webbrowser設置cookie,模擬登錄
8.使用demo
HttpWebRequest Get請求獲得頁面html
注意點:以前抓取時覺得很慢,最後發現是代理的問題。沒有代理就設置為null,這樣就不用每次去查找代理而影響執行效率。還有一些參數可以自行設置,比如模擬瀏覽器等。
/// <summary>
/// Fetches a page with an HTTP GET request and returns the response body as text.
/// </summary>
/// <param name="url">URL of the page to fetch.</param>
/// <param name="proxy">Proxy to use; pass null when there is no proxy, otherwise every request spends time looking one up.</param>
/// <param name="cookie">Cookies required by the target site.</param>
/// <param name="timeout">Request timeout in milliseconds.</param>
/// <returns>The response body decoded as UTF-8.</returns>
/// <exception cref="WebException">The request failed or timed out.</exception>
public static string Crawl(string url, WebProxy proxy, CookieContainer cookie, int timeout = 10000)
{
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
    request.Proxy = proxy;
    request.Timeout = timeout;
    request.AllowAutoRedirect = true;
    request.CookieContainer = cookie;

    try
    {
        // "using" releases the response, its stream and the reader on every path.
        using (WebResponse response = request.GetResponse())
        using (Stream body = response.GetResponseStream())
        using (StreamReader reader = new StreamReader(body, Encoding.UTF8))
        {
            return reader.ReadToEnd();
        }
    }
    catch
    {
        // Cancel whatever is still in flight, then rethrow with "throw;" so the
        // original stack trace is preserved ("throw ex;" would reset it).
        request.Abort();
        throw;
    }
}
HttpWebRequest Post請求獲得頁面html
/// <summary>
/// Sends form data with an HTTP POST request and returns the response body as text.
/// </summary>
/// <param name="url">URL to post to.</param>
/// <param name="postdata">Form data string to post, e.g. id=1&amp;name=test.</param>
/// <param name="proxy">Proxy to use; pass null when there is no proxy.</param>
/// <param name="cookie">Cookies required by the target site.</param>
/// <param name="timeout">Request timeout in milliseconds.</param>
/// <returns>The response body decoded as UTF-8.</returns>
/// <exception cref="WebException">The request failed or timed out.</exception>
public static string Crawl(string url, string postdata, WebProxy proxy, CookieContainer cookie, int timeout = 10000)
{
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
    request.Proxy = proxy;
    request.Timeout = timeout;
    request.AllowAutoRedirect = true;
    request.CookieContainer = cookie;

    try
    {
        // UTF-8 yields the same bytes as ASCII for ASCII-only input, but does not
        // replace non-ASCII characters with '?' the way Encoding.ASCII does.
        byte[] payload = Encoding.UTF8.GetBytes(postdata);

        request.Method = "POST";
        request.ContentType = "application/x-www-form-urlencoded";
        request.ContentLength = payload.Length;

        using (Stream requestStream = request.GetRequestStream())
        {
            requestStream.Write(payload, 0, payload.Length);
        }

        // "using" releases the response, its stream and the reader on every path.
        using (WebResponse response = request.GetResponse())
        using (Stream body = response.GetResponseStream())
        using (StreamReader reader = new StreamReader(body, Encoding.UTF8))
        {
            return reader.ReadToEnd();
        }
    }
    catch
    {
        // Cancel whatever is still in flight, then rethrow with "throw;" so the
        // original stack trace is preserved ("throw ex;" would reset it).
        request.Abort();
        throw;
    }
}
模擬登錄獲得cookie內容
先找到登錄的頁面,分析登錄頁面的post參數和鏈接,獲得cookie后可以直接傳到上面的方法
/// <summary>
/// Requests a page and returns the cookies collected while doing so.
/// </summary>
/// <param name="url">URL to request (for example the login page).</param>
/// <param name="proxy">Proxy to use; pass null when there is no proxy.</param>
/// <param name="timeout">Request timeout in milliseconds.</param>
/// <returns>The cookie container that was attached to the request.</returns>
/// <exception cref="WebException">The request failed or timed out.</exception>
public static CookieContainer GetCookie(string url, WebProxy proxy, int timeout = 10000)
{
    CookieContainer cookies = new CookieContainer();

    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
    request.Proxy = proxy;
    request.Timeout = timeout;
    request.AllowAutoRedirect = true;
    request.CookieContainer = cookies;

    try
    {
        // The body is not needed; the request is issued only so that the
        // container attached above receives the cookies from the response.
        using (WebResponse response = request.GetResponse())
        {
            return cookies;
        }
    }
    catch
    {
        // Cancel whatever is still in flight, then rethrow with "throw;" so the
        // original stack trace is preserved ("throw ex;" would reset it).
        request.Abort();
        throw;
    }
}
模擬登錄獲得cookie字符串
/// <summary>
/// Requests a page and returns the collected cookies as a cookie header string,
/// which can be handed to a WebBrowser control.
/// </summary>
/// <param name="url">URL to request (for example the login page).</param>
/// <param name="proxy">Proxy to use; pass null when there is no proxy.</param>
/// <param name="timeout">Request timeout in milliseconds.</param>
/// <returns>The cookie header for the requested URI, as produced by CookieContainer.GetCookieHeader.</returns>
/// <exception cref="WebException">The request failed or timed out.</exception>
public static string GetCookieString(string url, WebProxy proxy, int timeout = 10000)
{
    CookieContainer cookies = new CookieContainer();

    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
    request.Proxy = proxy;
    request.Timeout = timeout;
    request.AllowAutoRedirect = true;
    request.CookieContainer = cookies;

    try
    {
        // The body is not needed; the request is issued only so that the
        // container attached above receives the cookies from the response.
        using (WebResponse response = request.GetResponse())
        {
            return cookies.GetCookieHeader(request.RequestUri);
        }
    }
    catch
    {
        // Cancel whatever is still in flight, then rethrow with "throw;" so the
        // original stack trace is preserved ("throw ex;" would reset it).
        request.Abort();
        throw;
    }
}
代理的設置
/// <summary>
/// Creates a proxy with the given address and credentials.
/// </summary>
/// <param name="port">
/// Proxy address, e.g. "http://host:8080". Despite its name this parameter is used as
/// the full proxy address, not just a port number. When no scheme is present
/// ("host" or "host:8080"), "http://" is assumed.
/// </param>
/// <param name="user">User name for proxy authentication.</param>
/// <param name="password">Password for proxy authentication.</param>
/// <returns>The configured proxy.</returns>
public static WebProxy CreatePorxy(string port, string user, string password)
{
    string address = port;

    // Without a scheme, "host" is rejected by the Uri constructor and "host:8080"
    // is parsed with "host" as the scheme, so default to http in that case.
    if (address != null && address.IndexOf("://", StringComparison.Ordinal) < 0)
    {
        address = "http://" + address;
    }

    WebProxy proxy = new WebProxy();
    proxy.Address = new Uri(address);
    proxy.Credentials = new NetworkCredential(user, password);
    return proxy;
}
利用webbrowser 獲取js生成的頁面
說明:由於無法得知頁面的js何時執行完成,這裡在頁面加載完畢後固定再等待5s,並默認此時js已執行完成,效率有待提高。
另外,WebBrowser必須在STA線程中執行,需要為程序入口方法添加[STAThread]。
/// <summary>
/// Loads a page in a WebBrowser control so that its JavaScript runs, then returns
/// the inner HTML of the document's active element.
/// </summary>
/// <param name="url">URL of the page to load.</param>
/// <returns>
/// The inner HTML of the active element, or an empty string when the browser exposes
/// no document or no active element.
/// </returns>
public static string CrawlDynamic(string url)
{
    // The browser control is disposable; release it once the HTML has been read.
    using (WebBrowser browser = new WebBrowser())
    {
        browser.ScriptErrorsSuppressed = true;
        browser.Navigate(url);

        // Wait for the initial load, pumping messages so the control can make progress.
        while (browser.ReadyState != WebBrowserReadyState.Complete)
        {
            Application.DoEvents();
        }

        // There is no signal for "scripts finished", so keep pumping messages for a
        // further 5 seconds. Measuring elapsed time on this thread replaces the
        // timer whose callback set a flag from another thread.
        System.Diagnostics.Stopwatch scriptWait = System.Diagnostics.Stopwatch.StartNew();
        while (scriptWait.ElapsedMilliseconds < 1000 * 5)
        {
            Application.DoEvents();
        }

        HtmlDocument document = browser.Document;
        if (document == null || document.ActiveElement == null)
        {
            return string.Empty;
        }

        return document.ActiveElement.InnerHtml;
    }
}
為webbrowser設置cookie,模擬登錄
剛開始始終不成功,以為這個方法不能用,後面發現原來是domain設置有問題。我的例子是www.aa.xxx.com,設置為http://xxx.com可以使用,這個地方可能需要根據自己的情況來選擇域名。
/// <summary>
/// P/Invoke declaration of the InternetSetCookie function exported by wininet.dll.
/// </summary>
[DllImport("wininet.dll", CharSet = CharSet.Auto, SetLastError = true)]
public static extern bool InternetSetCookie(string lpszUrlName, string lbszCookieName, string lpszCookieData);

/// <summary>
/// Sets cookies for the WebBrowser control from a cookie header string.
/// </summary>
/// <param name="cookieStr">Cookie string of the form "name=value; name2=value2", e.g. the result of GetCookieString.</param>
/// <param name="domain">URL/domain the cookies are set for.</param>
public static void SetCookie(string cookieStr, string domain)
{
    if (string.IsNullOrEmpty(cookieStr))
    {
        return;
    }

    foreach (string pair in cookieStr.Split(';'))
    {
        // Split on the FIRST '=' only: cookie values may themselves contain '='
        // (e.g. base64 padding), and such cookies must not be dropped.
        int separator = pair.IndexOf('=');
        if (separator <= 0)
        {
            continue;
        }

        // Pairs are separated by "; ", so trim the blank that follows each ';'.
        string name = pair.Substring(0, separator).Trim();
        string value = pair.Substring(separator + 1).Trim();
        if (name.Length == 0)
        {
            continue;
        }

        InternetSetCookie(domain, name, value);
    }
}
使用demo
// Proxy; pass null directly when there is none.
// The address needs a scheme: a bare "xx.com" is rejected by the Uri constructor.
WebProxy proxy = WebCrawl.WebRequestHelper.CreatePorxy("http://xx.com", "user", "password");

// Get the cookies from the login page.
CookieContainer cookie = WebCrawl.WebRequestHelper.GetCookie("http://xxxx.login.com", proxy);

// Fetch the page.
string html = WebCrawl.WebRequestHelper.Crawl("http://xxx.index.com", proxy, cookie);

// Get the cookie string from the login page.
string cookiestr = WebCrawl.WebRequestHelper.GetCookieString("http://xxxx.login.com", proxy);

// Set the cookies for the WebBrowser control.
WebCrawl.WebRequestHelper.SetCookie(cookiestr, "https://xx.com");

// Fetch a page that requires login and is generated by JavaScript; ordinary pages work too.
string htmlWithJs = WebCrawl.WebRequestHelper.CrawlDynamic("http://xxx.index.com");