[DotnetSpider 系列目錄]
場景模擬
接上一篇,假設我們漏存了JD SKU對應的店鋪信息,這時需要重新完整采集所有的SKU數據嗎?重新采集的話,歷史數據就用不了了。因此,我們去京東頁面上找找看是否有提供相關的接口。
查找API請求接口
-
安裝 Fiddler, 並打開
-
在Fiddler查找一條條的訪問記錄,找到我們想要的接口
編寫爬蟲
-
分析返回的數據結果,我們可以先寫出數據對象的定義(注意Expression的值已經是JsonPath查詢表達式,同時Type必須設置為SelectorType.JsonPath)。另外需要注意的是,這次的爬蟲是更新型爬蟲,也就是說要把采集到的數據補充回原表,那麼就一定要設置主鍵是什麼,即在數據類上添加主鍵的定義:
/// <summary>
/// Entity describing one element of the JSON array returned by the shop-lookup API.
/// Each element is selected with the JsonPath expression <c>$.[*]</c> and is mapped to
/// the <c>jd</c>.<c>sku_v2</c> table (declared with <c>TableSuffix.Monday</c>).
/// The <c>sku</c> column is declared as the primary index.
/// </summary>
[Schema("jd", "sku_v2", TableSuffix.Monday)]
[EntitySelector(Expression = "$.[*]", Type = SelectorType.JsonPath)]
[Indexes(Primary = "sku")]
public class ProductUpdater : ISpiderEntity
{
    /// <summary>
    /// SKU identifier, read from <c>$.pid</c> and stored in column <c>sku</c> (string, length 25).
    /// </summary>
    [StoredAs("sku", DataType.String, 25)]
    [PropertySelector(Expression = "$.pid", Type = SelectorType.JsonPath)]
    public string Sku { get; set; }

    /// <summary>
    /// Shop name, read from <c>$.seller</c> and stored in column <c>shopname</c> (string, length 100).
    /// </summary>
    [StoredAs("shopname", DataType.String, 100)]
    [PropertySelector(Expression = "$.seller", Type = SelectorType.JsonPath)]
    public string ShopName { get; set; }

    /// <summary>
    /// Shop identifier, read from <c>$.shopId</c> and stored in column <c>shopid</c> (string, length 25).
    /// </summary>
    [StoredAs("shopid", DataType.String, 25)]
    [PropertySelector(Expression = "$.shopId", Type = SelectorType.JsonPath)]
    public string ShopId { get; set; }
}
-
定義Pipeline的類型為Update
// Register a MySQL pipeline in Update mode so that extracted entities are written
// onto existing rows instead of being inserted as new ones.
// The Data Source is "localhost", matching the same statement in the complete
// listing of this article (the snippet previously left the host blank).
context.AddEntityPipeline(new MySqlEntityPipeline
{
    ConnectString = "Database='taobao';Data Source=localhost ;User ID=root;Password=1qazZAQ!;Port=4306",
    Mode = PipelineMode.Update
});
-
由於返回的數據外面還包著一層json()這樣的padding(即JSONP回調包裝),所以需要先做一個截取操作。框架提供了PageHandler接口,並且我們實現了大量常用的Handler,用於HTML解析前的一些處理操作,因此完整的代碼如下:
/// <summary>
/// Builds an entity spider that looks up shop information for SKUs whose
/// <c>shopname</c> or <c>shopid</c> column is still null and writes the result
/// through a MySQL pipeline running in <c>PipelineMode.Update</c>.
/// </summary>
public class JdShopDetailSpider : EntitySpiderBuilder
{
    // NOTE(review): credentials are hard-coded in both connection strings; consider
    // moving them to configuration.
    // NOTE(review): the source connection uses port 3306 and the target uses port 4306 —
    // confirm that both are meant to reach the server holding jd.sku_v2_*.

    /// <summary>Connection string used to query the SKUs that still lack shop information.</summary>
    private const string SourceConnectString = "Database='test';Data Source= localhost;User ID=root;Password=1qazZAQ!;Port=3306";

    /// <summary>Connection string used by the update pipeline.</summary>
    private const string TargetConnectString = "Database='taobao';Data Source=localhost ;User ID=root;Password=1qazZAQ!;Port=4306";

    /// <summary>Request URL template; <c>{0}</c> is filled from the <c>sku</c> column.</summary>
    private const string ShopInfoUrlFormat = "http://chat1.jd.com/api/checkChat?my=list&pidList={0}&callback=json";

    /// <summary>
    /// Creates and configures the spider: downloader, start-URL source, update pipeline
    /// and the entity type to extract.
    /// </summary>
    /// <returns>The configured <see cref="EntitySpider"/>.</returns>
    protected override EntitySpider GetEntitySpider()
    {
        var spider = new EntitySpider(new Site());

        // Properties are assigned in the same order as the original initializer.
        spider.TaskGroup = "JD SKU Weekly";
        spider.Identity = "JD Shop details " + DateTimeUtils.MondayRunId;
        spider.CachedSize = 1;
        spider.ThreadNum = 8;
        spider.Downloader = CreateDownloader();
        spider.PrepareStartUrls = new PrepareStartUrls[] { CreateStartUrlSource() };

        var updatePipeline = new MySqlEntityPipeline
        {
            ConnectString = TargetConnectString,
            Mode = PipelineMode.Update
        };
        spider.AddEntityPipeline(updatePipeline);

        // NOTE(review): this extractor targets an HTML pager element ("J_bottomPage") while
        // the requested API returns JSON — confirm whether it is still needed here.
        var pagerRegion = new Selector
        {
            Type = SelectorType.XPath,
            Expression = "//*[@id=\"J_bottomPage\"]"
        };
        var targetUrlExtractor = new TargetUrlExtractor
        {
            Region = pagerRegion,
            Patterns = new List<string> { @"&page=[0-9]+&" }
        };
        spider.AddEntityType(typeof(ProductUpdater), targetUrlExtractor);

        return spider;
    }

    /// <summary>
    /// Creates the downloader with a <c>SubContentHandler</c> configured with the
    /// <c>json(</c> / <c>);</c> markers, so the response is cut down before parsing.
    /// </summary>
    private static HttpClientDownloader CreateDownloader()
    {
        // Presumably StartOffset = 5 skips the five characters of "json(" — verify
        // against SubContentHandler.
        var stripCallbackWrapper = new SubContentHandler
        {
            Start = "json(",
            End = ");",
            StartOffset = 5,
            EndOffset = 0
        };

        return new HttpClientDownloader
        {
            DownloadCompleteHandlers = new IDownloadCompleteHandler[] { stripCallbackWrapper }
        };
    }

    /// <summary>
    /// Creates the start-URL source: selects rows of this run's <c>jd.sku_v2_*</c> table
    /// whose shop columns are null and formats one request URL per <c>sku</c> value.
    /// </summary>
    private static BaseDbPrepareStartUrls CreateStartUrlSource()
    {
        return new BaseDbPrepareStartUrls()
        {
            Source = DataSource.MySql,
            ConnectString = SourceConnectString,
            QueryString = $"SELECT * FROM jd.sku_v2_{DateTimeUtils.MondayRunId} WHERE shopname is null or shopid is null order by sku",
            Columns = new[] { new DataColumn { Name = "sku" } },
            FormateStrings = new List<string> { ShopInfoUrlFormat }
        };
    }

    /// <summary>
    /// Entity describing one element of the JSON array returned by the API (selected with
    /// <c>$.[*]</c>), mapped to the <c>jd</c>.<c>sku_v2</c> table with <c>sku</c> declared
    /// as the primary index.
    /// </summary>
    [Schema("jd", "sku_v2", TableSuffix.Monday)]
    [EntitySelector(Expression = "$.[*]", Type = SelectorType.JsonPath)]
    [Indexes(Primary = "sku")]
    public class ProductUpdater : ISpiderEntity
    {
        /// <summary>SKU identifier, read from <c>$.pid</c>; stored in column <c>sku</c>.</summary>
        [StoredAs("sku", DataType.String, 25)]
        [PropertySelector(Expression = "$.pid", Type = SelectorType.JsonPath)]
        public string Sku { get; set; }

        /// <summary>Shop name, read from <c>$.seller</c>; stored in column <c>shopname</c>.</summary>
        [StoredAs("shopname", DataType.String, 100)]
        [PropertySelector(Expression = "$.seller", Type = SelectorType.JsonPath)]
        public string ShopName { get; set; }

        /// <summary>Shop identifier, read from <c>$.shopId</c>; stored in column <c>shopid</c>.</summary>
        [StoredAs("shopid", DataType.String, 25)]
        [PropertySelector(Expression = "$.shopId", Type = SelectorType.JsonPath)]
        public string ShopId { get; set; }
    }
}