文章地址
https://blog.csdn.net/sD7O95O/article/details/78097556
安裝爬蟲框架:透過 NuGet 安裝 DotnetSpider 套件
創建 HTTP 請求配置(Site 物件與請求標頭)
// Request headers attached to the site configuration. The values imitate a
// desktop Chrome 63 browser requesting pages from blog.csdn.net.
var requestHeaders = new Dictionary<string, string>
{
    ["Accept"] = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    ["Accept-Encoding"] = "gzip, deflate, br",
    ["Accept-Language"] = "zh-CN,zh;q=0.9",
    ["X-Requested-With"] = "XMLHttpRequest",
    ["Referer"] = "https://blog.csdn.net/sD7O95O/article/details/78096027",
    ["Connection"] = "keep-alive",
    ["Content-Type"] = "text/html; charset=UTF-8",
    ["Host"] = "blog.csdn.net",
    ["User-Agent"] = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"
};

// Site-level crawl settings.
// NOTE(review): SleepTime is presumably a delay in milliseconds and
// CycleRetryTimes a retry count — confirm against the DotnetSpider docs.
var site = new Site
{
    CycleRetryTimes = 1,
    SleepTime = 200,
    Headers = requestHeaders
};

// Register the start URL; `surl` is defined outside this snippet.
site.AddStartUrl(surl);
創建一個爬蟲
// Build the spider from the site configuration, a scheduler and the custom
// page processor, then attach the custom pipeline to the created spider.
var scheduler = new QueueDuplicateRemovedScheduler();
var processor = new CsdnArticleProcessor(); // custom processor
Spider spider = Spider.Create(site, scheduler, processor)
    .AddPipeline(new CsdnArticlePipeline()); // custom pipeline
設置爬蟲
// Configure the spider created above before it is run.
// Use HttpClientDownloader as the downloader.
spider.Downloader = new HttpClientDownloader();
// NOTE(review): presumably the number of worker threads — confirm against the DotnetSpider docs.
spider.ThreadNum = 1;
// NOTE(review): presumably a wait in milliseconds applied when no requests are queued — confirm.
spider.EmptySleepTime = 3000;
運行
// Run the configured spider.
spider.Run();
CsdnArticleProcessor 處理器(XPath 分析的具體代碼此處省略)
/// <summary>
/// Custom page processor derived from <see cref="BasePageProcessor"/>.
/// The extraction logic is omitted in this snippet: <c>Handle</c> is an empty
/// stub whose body contains only notes on what an implementation would do.
/// </summary>
public class CsdnArticleProcessor : BasePageProcessor
{
/// <summary>
/// Override of the base processor's <c>Handle</c> method; currently performs no work.
/// </summary>
/// <param name="page">The page passed in by the framework.</param>
protected override void Handle(Page page)
{
// Example selector call (left commented out):
//page.Selectable.SelectList(Selectors.XPath("//table[@id='ip_list']/tr[2]/td[2]/text()")).Nodes();
// Use Selectable to query the page and build the data objects you want.
// Use XPath to extract the data you need.
//...
// Then hand the results to the pipeline, e.g.:
//page.AddResultItem("CountryResult", results);
}
}
/// <summary>
/// Custom pipeline derived from <see cref="BasePipeline"/> that reads the
/// "CountryResult" entry from each result-item set handed to it.
/// Persistence (database handling) is left as a placeholder.
/// </summary>
class CsdnArticlePipeline : BasePipeline
{
    /// <summary>
    /// Reads the "CountryResult" entry from every element of <paramref name="resultItems"/>.
    /// </summary>
    /// <param name="resultItems">Result-item sets to process; a null sequence is ignored.</param>
    /// <param name="spider">The spider passed in by the framework; not used here.</param>
    public override void Process(IEnumerable<ResultItems> resultItems, ISpider spider)
    {
        // Nothing to process.
        if (resultItems == null)
        {
            return;
        }

        // The original read from an undefined identifier (`resukt`); the data
        // actually arrives as a sequence, so each element is read in turn.
        foreach (var resultItem in resultItems)
        {
            // Fetch the data stored under the "CountryResult" key.
            var conlist = resultItem.GetResultItem("CountryResult");

            // Database handling for `conlist` goes here.
        }
    }
}
