點擊我前往Github查看源代碼 別忘記star
本項目github地址:https://github.com/wangqifan/ZhiHu
如果你覺得服務商的服務太貴,可以考慮自建一個代理池。雲代理推薦阿布雲:https://www.abuyun.com/
應用場景
爬蟲過於頻繁地抓取網站信息會被反爬蟲機制屏蔽掉,或者有些網站對我們的IP有限制,一個IP只能操作一次,這個時候就需要設置代理了。這方面需求還是很大的,有專門的服務商提供代理,沒錢的話就自己動手打造一個代理池吧。
所用的工具
Redis的C#驅動-ServiceStack.Redis
Html解析-HtmlAgilityPack 任務調度-Quartz.NET
基本原理
部分網站上有免費的代理IP信息,比如xicidaili.com,proxy360.cn。這些網站有很多免費代理IP,然而有些質量不好,需要程序及時從代理池中刪掉質量低的代理,不斷加入優質代理。
思路來自知乎-https://www.zhihu.com/question/25566731

原理示意圖

接下來代碼實現
創建一個ProxyPool的控制台應用程序,並使用NuGet添加ServiceStack.Redis、HtmlAgilityPack、Quartz.NET包
創建一個Proxy類
/// <summary>
/// A single proxy endpoint: a host address plus a port number.
/// </summary>
public class Proxy
{
    /// <summary>
    /// Host name or IP address of the proxy. Carries the <c>[Key]</c> attribute.
    /// </summary>
    [Key]
    public string Adress { get; set; }

    /// <summary>
    /// Port number used together with <see cref="Adress"/> to reach the proxy.
    /// </summary>
    public int port { get; set; }
}
封裝一個資源獲取方法
/// <summary>
/// Downloads the body of <paramref name="url"/> and returns it decoded as UTF-8 text.
/// A gzip- or deflate-encoded body (as announced by the Content-Encoding header)
/// is decompressed before decoding.
/// </summary>
/// <param name="url">Absolute URL of the page to fetch.</param>
/// <returns>
/// The response body, or an empty string when the download fails for any reason.
/// This method never throws; failures are reported on standard error.
/// </returns>
public string DownloadHtml(string url)
{
    string source = string.Empty;
    try
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
        request.UserAgent = "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:49.0) Gecko/20100101 Firefox/49.0";

        // Disposing the outermost reader/stream also disposes the wrapped response stream,
        // so the response stream is obtained exactly once.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (Stream body = OpenDecodedStream(response))
        using (StreamReader reader = new StreamReader(body, Encoding.UTF8))
        {
            source = reader.ReadToEnd();
        }
    }
    catch (Exception ex)
    {
        // Best-effort download: callers treat an empty string as "nothing fetched",
        // but the reason is logged instead of being silently discarded.
        Console.Error.WriteLine("DownloadHtml failed for " + url + ": " + ex.Message);
    }
    return source;
}

/// <summary>
/// Wraps the response stream in the decompressor matching the Content-Encoding header,
/// or returns the raw stream when the body is not gzip/deflate encoded.
/// </summary>
private static Stream OpenDecodedStream(HttpWebResponse response)
{
    Stream raw = response.GetResponseStream();
    string encoding = response.ContentEncoding ?? string.Empty;

    // Ordinal, case-insensitive match: ToLower() is culture-sensitive and can
    // mis-handle "GZIP" under e.g. the Turkish culture.
    if (encoding.IndexOf("gzip", StringComparison.OrdinalIgnoreCase) >= 0)
    {
        return new GZipStream(raw, CompressionMode.Decompress);
    }
    if (encoding.IndexOf("deflate", StringComparison.OrdinalIgnoreCase) >= 0)
    {
        return new DeflateStream(raw, CompressionMode.Decompress);
    }
    return raw;
}
檢測代理是否有效
/// <summary>
/// Checks whether <paramref name="proxy"/> is usable by requesting the Baidu home page
/// through it and looking for the text "百度" in the response body.
/// </summary>
/// <param name="proxy">Proxy endpoint to probe.</param>
/// <returns>
/// <c>true</c> when the page was fetched through the proxy and contains the marker text;
/// <c>false</c> on any failure (timeout, connection error, unexpected content, null proxy).
/// </returns>
public static bool IsAvailable(Proxy proxy)
{
    const int ConnectTimeoutMs = 1000;
    const int ReadWriteTimeoutMs = 5000;

    try
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create("https://www.baidu.com/");
        request.UserAgent = "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:49.0) Gecko/20100101 Firefox/49.0";
        request.Proxy = new WebProxy(proxy.Adress, proxy.port);
        request.Timeout = ConnectTimeoutMs;
        // Timeout only bounds GetResponse(); without ReadWriteTimeout a proxy that stalls
        // while streaming the body would block this probe for the framework default.
        request.ReadWriteTimeout = ReadWriteTimeoutMs;

        // The response is disposed deterministically so the connection is released
        // even when the probe is run in a tight loop.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (Stream dataStream = response.GetResponseStream())
        using (StreamReader reader = new StreamReader(dataStream, Encoding.UTF8))
        {
            return reader.ReadToEnd().Contains("百度");
        }
    }
    catch (Exception)
    {
        // Any failure simply means the proxy is not usable; this is the expected
        // outcome for most free proxies, so it is not logged.
        return false;
    }
}
將代理添加到Redis的Set集合
/// <summary>
/// Adds <paramref name="proxy"/> to the "ProxyPool" Redis set as "address:port",
/// but only if it passes the <c>IsAvailable</c> check.
/// </summary>
/// <param name="proxy">Candidate proxy to validate and store.</param>
public void Add(Proxy proxy)
{
    // Validate first so no Redis client is created for proxies that are discarded.
    if (!IsAvailable(proxy))
    {
        return;
    }

    Console.WriteLine(proxy.Adress);
    using (RedisClient client = new RedisClient("127.0.0.1", 6379))
    {
        client.AddItemToSet("ProxyPool", proxy.Adress + ":" + proxy.port.ToString());
    }
}
下載西刺代理
/// <summary>
/// Scrapes the four xicidaili.com listing pages and passes every proxy found to <c>Add</c>.
/// Prints "西刺" when all pages have been processed.
/// </summary>
/// <param name="DATA">Unused; present so the method matches the thread-pool callback signature.</param>
public void Downloadxicidaili(object DATA)
{
    List<string> list = new List<string>()
    {
        "http://www.xicidaili.com/nt/",
        "http://www.xicidaili.com/nn/",
        "http://www.xicidaili.com/wn/",
        "http://www.xicidaili.com/wt/"
    };

    foreach (string url in list)
    {
        // Each page is handled independently so one failing page does not
        // prevent the remaining pages from being scraped.
        try
        {
            string html = DownloadHtml(url);
            if (string.IsNullOrEmpty(html))
            {
                continue;
            }

            HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
            doc.LoadHtml(html);

            // SelectNodes returns null (not an empty collection) when nothing matches.
            HtmlNodeCollection rows = doc.DocumentNode.SelectNodes("//tr[@class='odd']");
            if (rows == null)
            {
                continue;
            }

            foreach (HtmlNode row in rows)
            {
                HtmlNode addressCell = row.SelectSingleNode("td[2]");
                HtmlNode portCell = row.SelectSingleNode("td[3]");
                int port;
                if (addressCell == null || portCell == null
                    || !int.TryParse(portCell.InnerHtml.Trim(), out port))
                {
                    // Skip rows that do not have the expected address/port cells.
                    continue;
                }

                Proxy proxy = new Proxy();
                proxy.Adress = addressCell.InnerHtml.Trim();
                proxy.port = port;
                Console.WriteLine(proxy.Adress);
                Add(proxy);
            }
        }
        catch (Exception ex)
        {
            // This runs as a thread-pool work item, where an unhandled exception
            // would terminate the process; log and move on to the next page.
            Console.Error.WriteLine("Downloadxicidaili failed for " + url + ": " + ex.Message);
        }
    }

    Console.WriteLine("西刺");
}
快代理
/// <summary>
/// Scrapes listing pages 1 to 3 of kuaidaili.com and passes every proxy found to <c>Add</c>.
/// Each table row is expected to hold the address in its first cell and the port in its second.
/// </summary>
/// <param name="DATA">Unused; present so the method matches the thread-pool callback signature.</param>
public void Downkuaidaili(object DATA)
{
    // The original pointed at the xicidaili listing (a copy-paste slip): that site keeps
    // the address in td[2] and the port in td[3], which does not match the parsing below.
    // NOTE(review): confirm the current kuaidaili listing path before relying on this URL.
    string url = "http://www.kuaidaili.com/proxylist/";

    for (int i = 1; i < 4; i++)
    {
        string pageUrl = url + i.ToString();

        // Each page is handled independently so one failing page does not
        // prevent the remaining pages from being scraped.
        try
        {
            string html = DownloadHtml(pageUrl);
            if (string.IsNullOrEmpty(html))
            {
                continue;
            }

            HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
            doc.LoadHtml(html);

            // SelectNodes returns null (not an empty collection) when nothing matches.
            HtmlNodeCollection rows = doc.DocumentNode.SelectNodes("//tbody/tr");
            if (rows == null)
            {
                continue;
            }

            foreach (HtmlNode row in rows)
            {
                // td[1] instead of FirstChild: the first child of a <tr> can be a
                // whitespace text node rather than the first cell.
                HtmlNode addressCell = row.SelectSingleNode("td[1]");
                HtmlNode portCell = row.SelectSingleNode("td[2]");
                int port;
                if (addressCell == null || portCell == null
                    || !int.TryParse(portCell.InnerHtml.Trim(), out port))
                {
                    continue;
                }

                Proxy proxy = new Proxy();
                proxy.Adress = addressCell.InnerHtml.Trim();
                proxy.port = port;
                Console.WriteLine(proxy.Adress);
                Add(proxy);
            }
        }
        catch (Exception ex)
        {
            // This runs as a thread-pool work item, where an unhandled exception
            // would terminate the process; log and move on to the next page.
            Console.Error.WriteLine("Downkuaidaili failed for " + pageUrl + ": " + ex.Message);
        }
    }
}
Proxy360
/// <summary>
/// Scrapes the proxy360.cn front page and passes every proxy found to <c>Add</c>.
/// Prints "proxy360" when the page has been processed.
/// </summary>
/// <param name="DATA">Unused; present so the method matches the thread-pool callback signature.</param>
public void Downloadproxy360(object DATA)
{
    try
    {
        string url = "http://www.proxy360.cn/default.aspx";
        string html = DownloadHtml(url);

        HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
        doc.LoadHtml(html);

        // SelectNodes returns null (not an empty collection) when nothing matches.
        HtmlNodeCollection items = doc.DocumentNode.SelectNodes("//div[@class='proxylistitem']");
        if (items != null)
        {
            foreach (HtmlNode item in items)
            {
                // The address/port spans live under the second child node of each item.
                if (item.ChildNodes.Count < 2)
                {
                    continue;
                }
                HtmlNode container = item.ChildNodes[1];

                HtmlNode addressSpan = container.SelectSingleNode("span[1]");
                HtmlNode portSpan = container.SelectSingleNode("span[2]");
                int port;
                if (addressSpan == null || portSpan == null
                    || !int.TryParse(portSpan.InnerHtml.Trim(), out port))
                {
                    // Skip entries that do not have the expected layout instead of
                    // abandoning the rest of the list.
                    continue;
                }

                Proxy proxy = new Proxy();
                proxy.Adress = addressSpan.InnerHtml.Trim();
                proxy.port = port;
                Console.WriteLine(proxy.Adress);
                Add(proxy);
            }
        }

        Console.WriteLine("proxy360");
    }
    catch (Exception ex)
    {
        // This runs as a thread-pool work item, where an unhandled exception
        // would terminate the process; log instead of swallowing silently.
        Console.Error.WriteLine("Downloadproxy360 failed: " + ex.Message);
    }
}
多線程爬取
/// <summary>
/// Queues the three site scrapers on the thread pool so they run concurrently.
/// </summary>
public void Initial()
{
    WaitCallback[] crawlers =
    {
        Downloadxicidaili,
        Downkuaidaili,
        Downloadproxy360
    };

    // Queued in the same order as listed above.
    foreach (WaitCallback crawler in crawlers)
    {
        ThreadPool.QueueUserWorkItem(crawler);
    }
}
刪除接口和隨機獲取接口
/// <summary>
/// Client-side access to the "ProxyPool" Redis set: fetch a random proxy, or remove one.
/// </summary>
public class Pool
{
    private const string SetKey = "ProxyPool";

    // Defaults match the server the Add method writes to; previously this class
    // read from a different, hard-coded remote address than the one being filled.
    private static string host = "127.0.0.1";
    private static int port = 6379;

    /// <summary>Redis host to connect to. Defaults to the local machine.</summary>
    public static string Host
    {
        get { return host; }
        set { host = value; }
    }

    /// <summary>Redis port to connect to. Defaults to 6379.</summary>
    public static int Port
    {
        get { return port; }
        set { port = value; }
    }

    /// <summary>
    /// Returns a random "address:port" entry from the pool.
    /// </summary>
    /// <returns>
    /// A pool entry, or an empty string when the pool is empty or Redis cannot be reached.
    /// </returns>
    public static string GetProxy()
    {
        try
        {
            using (RedisClient client = new RedisClient(Host, Port))
            {
                // Normalise a missing item to the empty string so callers get
                // one consistent "nothing available" value.
                return client.GetRandomItemFromSet(SetKey) ?? string.Empty;
            }
        }
        catch (Exception ex)
        {
            Console.Error.WriteLine("GetProxy failed: " + ex.Message);
            return string.Empty;
        }
    }

    /// <summary>
    /// Removes <paramref name="value"/> from the pool. Despite the name, this deletes
    /// the entry; it is what callers use to discard a proxy that stopped working.
    /// </summary>
    /// <param name="value">The "address:port" entry to remove.</param>
    public static void PushProxy(string value)
    {
        try
        {
            using (RedisClient client = new RedisClient(Host, Port))
            {
                client.RemoveItemFromSet(SetKey, value);
            }
        }
        catch (Exception)
        {
            Console.WriteLine("刪除代理失敗!");
        }
    }
}
不斷檢測代理池
/// <summary>
/// Runs forever: every 500 ms takes a random entry from the pool, re-checks it with
/// <c>IsAvailable</c>, and removes it from the pool when it no longer works.
/// Entries that cannot be parsed as "address:port" are removed as well.
/// </summary>
public void TestAll()
{
    while (true)
    {
        string entry = Pool.GetProxy();
        if (!string.IsNullOrEmpty(entry))
        {
            Proxy candidate;
            // A malformed entry can never become usable, so it is evicted just like
            // a dead proxy instead of throwing and ending the monitoring loop.
            if (!TryParseEntry(entry, out candidate) || !IsAvailable(candidate))
            {
                Pool.PushProxy(entry);
            }
        }
        Thread.Sleep(500);
    }
}

/// <summary>
/// Splits an "address:port" pool entry at its first colon into a <c>Proxy</c>.
/// Returns false when there is no address part or the port is not an integer.
/// </summary>
private static bool TryParseEntry(string entry, out Proxy proxy)
{
    proxy = null;

    int index = entry.IndexOf(':');
    if (index <= 0)
    {
        return false;
    }

    int port;
    if (!int.TryParse(entry.Substring(index + 1), out port))
    {
        return false;
    }

    proxy = new ProxyPool.Proxy();
    proxy.Adress = entry.Substring(0, index);
    proxy.port = port;
    return true;
}
job類
/// <summary>
/// Scheduler job that starts one scraping round each time it fires.
/// </summary>
class TotalJob : IJob
{
    /// <summary>
    /// Creates a fresh <c>PoolManage</c> and calls its <c>Initial</c> method.
    /// </summary>
    /// <param name="context">Execution context supplied by the scheduler; not used.</param>
    public void Execute(IJobExecutionContext context)
    {
        new PoolManage().Initial();
    }
}
任務聲明
// Scheduler started by Run(); kept so Main can shut it down on exit.
private static IScheduler _scheduler;

/// <summary>
/// Entry point: starts the scheduler, waits for a key press, then shuts the scheduler down.
/// </summary>
static void Main(string[] args)
{
    Run();
    Console.WriteLine("Press any key to close the application");

    // Actually wait for the key the message asks for, then stop the scheduler
    // so the process can exit.
    Console.ReadKey();
    if (_scheduler != null)
    {
        _scheduler.Shutdown();
    }
}

/// <summary>
/// Starts a scheduler and schedules <c>TotalJob</c> to run immediately and then
/// once every minute, indefinitely. Scheduler errors are written to the console.
/// </summary>
private static void Run()
{
    try
    {
        StdSchedulerFactory factory = new StdSchedulerFactory();
        _scheduler = factory.GetScheduler();
        _scheduler.Start();

        IJobDetail job = JobBuilder.Create<TotalJob>()
            .WithIdentity("job1", "group1")
            .Build();

        ITrigger trigger = TriggerBuilder.Create()
            .WithIdentity("trigger1", "group1")
            .StartNow()
            .WithSimpleSchedule(x => x
                .WithIntervalInMinutes(1)
                .RepeatForever())
            .Build();

        _scheduler.ScheduleJob(job, trigger);
    }
    catch (SchedulerException se)
    {
        Console.WriteLine(se);
    }
}
Github地址:https://github.com/wangqifan/ProxyPool
