[DotnetSpider 系列目錄]
上一篇介紹的基本的使用方式,自由度很高,但是編寫的代碼相對就多了。而我所在的行業其實大部分都是定題爬蟲, 只需要采集指定的頁面並結構化數據。為了提高開發效率, 我實現了利用實體配置的方式來實現爬蟲
創建 Console 項目
利用NUGET添加包
DotnetSpider2.Extension
定義配置式數據對象
- 數據對象必須實現 ISpiderEntity 接口
- Schema 定義數據庫名稱、表名及表名後綴
- Indexes 定義數據表的主鍵、唯一索引、索引
- EntitySelector 定義從頁面數據中抽取數據對象的規則
定義一個原始的數據對象類
// Minimal crawl-model definition: every entity class must implement the
// framework's ISpiderEntity marker interface before selectors/schema are added.
public class Product : ISpiderEntity { }
使用Chrome打開京東商品頁 http://list.jd.com/list.html?cat=9987,653,655&page=2&JL=6_0_0&ms=5#J_main
- 使用快捷鍵F12打開開發者工具
- 選中一個商品,並觀察Html結構
我們發現每個商品都在class為gl-i-wrap j-sku-item的DIV下面,因此添加EntitySelector到數據對象Product的類名上面。( XPath的寫法不是唯一的,不熟悉的可以去W3CSchool學習一下, 框架也支持使用Css甚至正則來選擇出正確的Html片段)。
[EntitySelector(Expression = "//li[@class='gl-item']/div[contains(@class,'j-sku-item')]")] public class Product : ISpiderEntity
-
添加數據庫及索引信息
[Schema("test", "sku", TableSuffix.Today)] [EntitySelector(Expression = "//li[@class='gl-item']/div[contains(@class,'j-sku-item')]")] [Indexes(Index = new[] { "category" }, Unique = new[] { "category,sku", "sku" })] public class Product : ISpiderEntity
-
假設你需要采集SKU信息,觀察HTML結構,計算出相對的XPath, 為什么是相對XPath?因為EntitySelector已經把HTML截成片段了,內部的Html元素查詢都是相對於EntitySelector查詢出來的元素。最后再加上數據庫中列的信息
// Product entity, one instance per element matched by EntitySelector.
// - Schema: database "test", table "sku", table name suffixed per day (TableSuffix.Today).
// - Indexes: plain index on "category"; unique indexes on (category,sku) and (sku).
// - Sku: stored as a 25-char string, read from the matched element's own
//   data-sku attribute — the "./" XPath is relative to the EntitySelector result.
[Schema("test", "sku", TableSuffix.Today)] [EntitySelector(Expression = "//li[@class='gl-item']/div[contains(@class,'j-sku-item')]")] [Indexes(Index = new[] { "category" }, Unique = new[] { "category,sku", "sku" })] public class Product : ISpiderEntity { [StoredAs("sku", DataType.String, 25)] [PropertySelector(Expression = "./@data-sku")] public string Sku { get; set; } }
-
爬蟲內部,鏈接是通過Request對象來存儲信息的,構造Request對象時可以添加額外的屬性值,這時候允許數據對象從Request的額外屬性值中查詢數據
// CategoryName is not extracted from the page: SelectorType.Enviroment
// ("Enviroment" is the framework's own spelling) resolves the key "name"
// from the extra values dictionary attached to the start Request.
[StoredAs("category", DataType.String, 20)] [PropertySelector(Expression = "name", Type = SelectorType.Enviroment)] public string CategoryName { get; set; }
配置爬蟲(繼承EntitySpiderBuilder)
// Builds the JD SKU spider: single worker thread, timestamped identity,
// a MySQL entity pipeline, one start URL carrying extra "Enviroment" values,
// and a paging-URL extractor bound to the Product entity.
protected override EntitySpider GetEntitySpider()
{
    var site = new Site
    {
        //HttpProxyPool = new HttpProxyPool(new KuaidailiProxySupplier("快代理API"))
    };

    var spider = new EntitySpider(site)
    {
        UserId = "DotnetSpider",
        TaskGroup = "JdSkuSampleSpider"
    };

    spider.SetThreadNum(1);
    // Timestamp keeps each run's identity unique.
    spider.SetIdentity("JD_sku_store_test_" + DateTime.Now.ToString("yyyy_MM_dd_hhmmss"));
    spider.AddEntityPipeline(new MySqlEntityPipeline("Database='test';Data Source=localhost;User ID=root;Password=1qazZAQ!;Port=3306"));

    // The dictionary supplies values queried via SelectorType.Enviroment.
    spider.AddStartUrl(
        "http://list.jd.com/list.html?cat=9987,653,655&page=2&JL=6_0_0&ms=5#J_main",
        new Dictionary<string, object> { { "name", "手機" }, { "cat3", "655" } });

    // Follow links found in the paging area that match the &page=N& pattern.
    spider.AddEntityType(typeof(Product), new TargetUrlExtractor
    {
        Region = new BaseSelector { Type = SelectorType.XPath, Expression = "//span[@class=\"p-num\"]" },
        Patterns = new List<string> { @"&page=[0-9]+&" }
    });

    return spider;
}
-
其中AddStartUrl第二個參數Dictionary<string, object>就是用於Enviroment查詢的數據
-
配置Scheduler: 默認是使用內存Queue做Url調度,如果想使用多台機器分布式采集則需要配置為RedisScheduler
context.SetScheduler(new RedisScheduler { Host = "", Password = "", Port = 6379 });
-
在添加數據對象時,可以配置數據鏈接的合法性驗證。用在一個網站采集多種鏈接時映射到不同的數據對象。同時此驗證會抽取當前頁面中符合規則的Url加入到Scheduler中繼續采集。
context.AddEntityType(typeof(Product), new TargetUrlExtractor { Region = new BaseSelector { Type = SelectorType.XPath, Expression = "//span[@class=\"p-num\"]" }, Patterns = new List<string> { @"&page=[0-9]+&" } });
-
添加一個MySql的數據管道,只需要配置好連接字符串即可
context.AddEntityPipeline(new MySqlEntityPipeline("Database='test';Data Source=localhost;User ID=root;Password=1qazZAQ!;Port=3306"));
完整代碼
// Complete sample: a spider builder that scrapes JD SKU listings into MySQL.
public class JdSkuSampleSpider : EntitySpiderBuilder
{
    protected override EntitySpider GetEntitySpider()
    {
        var site = new Site
        {
            //HttpProxyPool = new HttpProxyPool(new KuaidailiProxySupplier("快代理API"))
        };

        var spider = new EntitySpider(site)
        {
            UserId = "DotnetSpider",
            TaskGroup = "JdSkuSampleSpider"
        };

        spider.SetThreadNum(1);
        // Timestamp keeps each run's identity unique.
        spider.SetIdentity("JD_sku_store_test_" + DateTime.Now.ToString("yyyy_MM_dd_hhmmss"));
        spider.AddEntityPipeline(new MySqlEntityPipeline("Database='test';Data Source=localhost;User ID=root;Password=1qazZAQ!;Port=3306"));

        // Extra values here are resolved by PropertySelectors with SelectorType.Enviroment.
        spider.AddStartUrl(
            "http://list.jd.com/list.html?cat=9987,653,655&page=2&JL=6_0_0&ms=5#J_main",
            new Dictionary<string, object> { { "name", "手機" }, { "cat3", "655" } });

        // Follow pagination links matching &page=N& found inside the p-num span.
        spider.AddEntityType(typeof(Product), new TargetUrlExtractor
        {
            Region = new BaseSelector { Type = SelectorType.XPath, Expression = "//span[@class=\"p-num\"]" },
            Patterns = new List<string> { @"&page=[0-9]+&" }
        });

        return spider;
    }

    // One Product per li.gl-item > div.j-sku-item element; stored in test.sku
    // with a per-day table suffix, indexed on category, unique on (category,sku) and (sku).
    [Schema("test", "sku", TableSuffix.Today)]
    [EntitySelector(Expression = "//li[@class='gl-item']/div[contains(@class,'j-sku-item')]")]
    [Indexes(Index = new[] { "category" }, Unique = new[] { "category,sku", "sku" })]
    public class Product : ISpiderEntity
    {
        // "./" XPaths are relative to the element matched by EntitySelector.
        [StoredAs("sku", DataType.String, 25)]
        [PropertySelector(Expression = "./@data-sku")]
        public string Sku { get; set; }

        // Resolved from the start URL's extra values, not from the page
        // ("Enviroment" is the framework's own spelling of the enum member).
        [StoredAs("category", DataType.String, 20)]
        [PropertySelector(Expression = "name", Type = SelectorType.Enviroment)]
        public string CategoryName { get; set; }

        [StoredAs("cat3", DataType.String, 20)]
        [PropertySelector(Expression = "cat3", Type = SelectorType.Enviroment)]
        public int CategoryId { get; set; }

        [StoredAs("url", DataType.Text)]
        [PropertySelector(Expression = "./div[1]/a/@href")]
        public string Url { get; set; }

        [StoredAs("commentscount", DataType.String, 32)]
        [PropertySelector(Expression = "./div[5]/strong/a")]
        public long CommentsCount { get; set; }

        [StoredAs("shopname", DataType.String, 100)]
        [PropertySelector(Expression = ".//div[@class='p-shop']/@data-shop_name")]
        public string ShopName { get; set; }

        [StoredAs("name", DataType.String, 50)]
        [PropertySelector(Expression = ".//div[@class='p-name']/a/em")]
        public string Name { get; set; }

        [StoredAs("venderid", DataType.String, 25)]
        [PropertySelector(Expression = "./@venderid")]
        public string VenderId { get; set; }

        [StoredAs("jdzy_shop_id", DataType.String, 25)]
        [PropertySelector(Expression = "./@jdzy_shop_id")]
        public string JdzyShopId { get; set; }

        // "Monday" / "Now" are built-in Enviroment keys supplied by the framework.
        [StoredAs("run_id", DataType.Date)]
        [PropertySelector(Expression = "Monday", Type = SelectorType.Enviroment)]
        public DateTime RunId { get; set; }

        [PropertySelector(Expression = "Now", Type = SelectorType.Enviroment)]
        [StoredAs("cdate", DataType.Time)]
        public DateTime CDate { get; set; }
    }
}
運行爬蟲
// Console entry point: build the sample spider and start it.
// The "rerun" argument restarts the crawl from scratch instead of resuming.
public class Program
{
    public static void Main(string[] args)
    {
        var builder = new JdSkuSampleSpider();
        builder.Run("rerun");
    }
}
不到100行代碼完成一個爬蟲,是不是異常的簡單?