DotnetSpider (一) 架構的理解、應用、搭建


**溫馨提示:如需轉載本文,請注明內容出處。**

本文鏈接:http://www.cnblogs.com/grom/p/8931650.html 

 

本文Demo:https://github.com/Grom-Li/DataSpider 適用版本: 2.5.0、2.5.1

 

最近在做爬蟲,之前一直在使用 HttpWebRequest 和 WebClient ,很方便快捷,也很適合新手,但隨着抓取任務的增多,多任務,多庫等情況的出現,使用一個優秀的爬蟲框架是十分必要的。於是開始接觸dotnetspider。

 

  

借鑒一下框架的設計圖,在引入dotnetspider的NuGet包后,我基本也是按照這個進行了分層

  

 

Data.Spider - 存放前台頁面(Winform、控制台)和實體爬蟲(EntitySpider)等,相當於發起請求的起點。

Spider.Downloader - 封裝請求等信息,可實現自定義cookie等,非必須。

Spider.Processor - 處理器,繼承 IPageProcessor 實現對抓取內容的處理

Spider.Pipe - 管道,我將它理解為經過了 Processor 處理后的一個回調,將處理好的數據存儲(文件、數據庫等)

Spider.Entity - 數據實體類,繼承 SpiderEntity

Spider.Command - 一些常用的公用命令,我這目前存放着轉數據格式類,后台執行JS類,SqlHelper(因架構自帶數據庫管道,暫時沒用)等

這樣的分層也是參考了源碼的示例

  

經過這幾天的嘗試,發現這個框架真的非常靈活。以凹凸租車的爬蟲為例,上代碼:

  實體類

/// <summary>
/// Entity describing one car record scraped from the Atzuche search response.
/// Each instance is selected from an element of <c>$.data.content</c> via JsonPath,
/// and every property is filled from the JSON field named in its
/// <c>PropertyDefine</c> expression.
/// </summary>
/// <remarks>
/// NOTE(review): the <c>EntityTable</c> arguments look like database name, table name
/// and a per-day table suffix — confirm against the DotnetSpider version in use.
/// </remarks>
[EntityTable("CarWinsSpider", "AtzucheCar", EntityTable.Today)]
[EntitySelector(Expression = "$.data.content[*]", Type = SelectorType.JsonPath)]

public class AtzucheModel : SpiderEntity
{
  /// <summary>
  /// Vehicle number.
  /// </summary>
  [PropertyDefine(Expression = "$.carNo", Type = SelectorType.JsonPath)]
  public int carNo { get; set; }
  /// <summary>
  /// Brand.
  /// </summary>
  // Disabled value formatters, kept for reference (they would strip control
  // characters, "&nbsp;", quotes and spaces from the extracted value):
  //[ReplaceFormatter(NewValue = "", OldValue = "\r")]
  //[ReplaceFormatter(NewValue = "", OldValue = "\t")]
  //[ReplaceFormatter(NewValue = "", OldValue = "&nbsp;")]
  //[ReplaceFormatter(NewValue = "", OldValue = "\n")]
  //[ReplaceFormatter(NewValue = "", OldValue = "\"")]
  //[ReplaceFormatter(NewValue = "", OldValue = " ")]
  [PropertyDefine(Expression = "$.brand", Type = SelectorType.JsonPath)]
  public string brand { get; set; }
  /// <summary>
  /// Address of the car.
  /// </summary>
  [PropertyDefine(Expression = "$.carAddr", Type = SelectorType.JsonPath)]
  public string carAddr { get; set; }
  /// <summary>
  /// Car series (model line).
  /// </summary>
  [PropertyDefine(Expression = "$.type", Type = SelectorType.JsonPath)]
  public string type { get; set; }
  /// <summary>
  /// Engine displacement.
  /// </summary>
  [PropertyDefine(Expression = "$.sweptVolum", Type = SelectorType.JsonPath)]
  public string sweptVolum { get; set; }
  /// <summary>
  /// Picture (read from the <c>coverPic</c> field).
  /// </summary>
  [PropertyDefine(Expression = "$.coverPic", Type = SelectorType.JsonPath)]
  public string coverPic { get; set; }
  /// <summary>
  /// Daily rental price.
  /// </summary>
  [PropertyDefine(Expression = "$.dayPrice", Type = SelectorType.JsonPath)]
  public int dayPrice { get; set; }
  /// <summary>
  /// Kilometres (read from the <c>distance</c> field).
  /// </summary>
  [PropertyDefine(Expression = "$.distance", Type = SelectorType.JsonPath)]
  public string distance { get; set; }
  /// <summary>
  /// Rating score.
  /// </summary>
  [PropertyDefine(Expression = "$.evalScore", Type = SelectorType.JsonPath)]
  public string evalScore { get; set; }
  /// <summary>
  /// Value of the <c>gbType</c> field (presumably gearbox type — TODO confirm).
  /// </summary>
  [PropertyDefine(Expression = "$.gbType", Type = SelectorType.JsonPath)]
  public string gbType { get; set; }
  /// <summary>
  /// License plate.
  /// </summary>
  [PropertyDefine(Expression = "$.plateNum", Type = SelectorType.JsonPath)]
  public string plateNum { get; set; }
  /// <summary>
  /// Value of the <c>replyTag</c> field (meaning not documented in this file).
  /// </summary>
  [PropertyDefine(Expression = "$.replyTag", Type = SelectorType.JsonPath)]
  public string replyTag { get; set; }
  /// <summary>
  /// Value of the <c>transCount</c> field (presumably a transaction count — TODO confirm).
  /// </summary>
  [PropertyDefine(Expression = "$.transCount", Type = SelectorType.JsonPath)]
  public string transCount { get; set; }
  /// <summary>
  /// Model year.
  /// </summary>
  [PropertyDefine(Expression = "$.year", Type = SelectorType.JsonPath)]
  public int year { get; set; }
  /// <summary>
  /// Value of the <c>isPrivilege</c> field (presumably a 0/1 flag — TODO confirm).
  /// </summary>
  [PropertyDefine(Expression = "$.isPrivilege", Type = SelectorType.JsonPath)]
  public int isPrivilege { get; set; }
  /// <summary>
  /// Value of the <c>isRecommend</c> field (presumably a 0/1 flag — TODO confirm).
  /// </summary>
  [PropertyDefine(Expression = "$.isRecommend", Type = SelectorType.JsonPath)]
  public int isRecommend { get; set; }
  /// <summary>
  /// Value of the <c>isUpgrade</c> field (presumably a 0/1 flag — TODO confirm).
  /// </summary>
  [PropertyDefine(Expression = "$.isUpgrade", Type = SelectorType.JsonPath)]
  public int isUpgrade { get; set; }
  /// <summary>
  /// Value of the <c>lat</c> field (presumably latitude, stored as text — TODO confirm).
  /// </summary>
  [PropertyDefine(Expression = "$.lat", Type = SelectorType.JsonPath)]
  public string lat { get; set; }
  /// <summary>
  /// Value of the <c>lon</c> field (presumably longitude, stored as text — TODO confirm).
  /// </summary>
  [PropertyDefine(Expression = "$.lon", Type = SelectorType.JsonPath)]
  public string lon { get; set; }
  /// <summary>
  /// Value of the <c>queryId</c> field (meaning not documented in this file).
  /// </summary>
  [PropertyDefine(Expression = "$.queryId", Type = SelectorType.JsonPath)]
  public string queryId { get; set; }
  /// <summary>
  /// Value of the <c>supplyCarService</c> field (meaning not documented in this file).
  /// </summary>
  [PropertyDefine(Expression = "$.supplyCarService", Type = SelectorType.JsonPath)]
  public int supplyCarService { get; set; }
  /// <summary>
  /// Value of the <c>freeCarService</c> field (meaning not documented in this file).
  /// </summary>
  [PropertyDefine(Expression = "$.freeCarService", Type = SelectorType.JsonPath)]
  public int freeCarService { get; set; }
  /// <summary>
  /// Value of the <c>isShenMaCar</c> field (presumably a 0/1 flag — TODO confirm).
  /// </summary>
  [PropertyDefine(Expression = "$.isShenMaCar", Type = SelectorType.JsonPath)]
  public int isShenMaCar { get; set; }
  /// <summary>
  /// Value of the <c>supportGetReturn</c> field (meaning not documented in this file).
  /// </summary>
  [PropertyDefine(Expression = "$.supportGetReturn", Type = SelectorType.JsonPath)]
  public int supportGetReturn { get; set; }
  /// <summary>
  /// Value of the <c>confirmation</c> field (meaning not documented in this file).
  /// </summary>
  [PropertyDefine(Expression = "$.confirmation", Type = SelectorType.JsonPath)]
  public int confirmation { get; set; }
}

 

起始:

  

/// <summary>
/// Main entry point of the application. Builds the site settings and a single
/// GET start request, wires up the processor, pipeline, monitor and downloader,
/// then runs the spider on one thread and waits for console input before exiting.
/// </summary>
[STAThread]
static void Main()
{
  var site = new Site
  {
    CycleRetryTimes = 1,
    SleepTime = 200,
    Headers = new Dictionary<string, string>()
    {
      {"Accept","application/json, text/javascript, */*; q=0.01" },
      {"Accept-Encoding","gzip, deflate" },
      // The language preference list belongs under Accept-Language; the key was
      // previously a copy of the Accept-Encoding value, producing a bogus header.
      {"Accept-Language","zh-CN,zh;q=0.9" },
      {"X-Requested-With","XMLHttpRequest" },
      { "Referer", "http://www.atzuche.com/hz/car/search"},
      { "Connection","keep-alive" },
      { "Content-Type","application/json;charset=UTF-8" },
      { "Host","www.atzuche.com"},
      { "User-Agent","Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"}
    }
  };
  List<Request> resList = new List<Request>();

  Request res = new Request();
  // For a POST request, res.PostBody would carry the form-encoded payload;
  // it is not needed for this GET request.
  res.Url = "http://www.atzuche.com/car/searchListMap/2?cityCode=330100&sceneCode=U002&filterCondition%5Blon%5D=120.219294&filterCondition%5Blat%5D=30.259258&filterCondition%5Bseq%5D=4&pageNum=1&pageSize=0";
  res.Method = System.Net.Http.HttpMethod.Get;

  resList.Add(res);

  var spider = DotnetSpider.Core.Spider.Create(site, new QueueDuplicateRemovedScheduler(), new AtzucheProcessor())
    .AddStartRequests(resList.ToArray()) // requests to crawl; pages are parsed by AtzucheProcessor
    .AddPipeline(new AtzuchePipe());     // pipeline that receives the processed data

  //----------------------------------
  spider.Monitor = new DotnetSpider.Core.Monitor.NLogMonitor();
  // Custom downloader; the stock alternative is DotnetSpider.Core.Downloader.HttpClientDownloader.
  spider.Downloader = new AtzucheDownloader();
  // Do not clear the scheduler after the spider completes.
  spider.ClearSchedulerAfterComplete = false;
  //----------------------------------

  spider.ThreadNum = 1;
  spider.Run();

  Console.WriteLine("Press any key to continue...");
  Console.Read();

}

 

這里也可將整個抓取方法當做一個Spider實例單獨放置 -> EntitySpider

 

  

/// <summary>
/// Main entry point of the application. Runs the crawl through the dedicated
/// <c>AtzucheEntitySpider</c> class instead of configuring a spider inline.
/// </summary>
[STAThread]
static void Main()
{
  var entitySpider = new AtzucheEntitySpider();

  // Processor that parses each downloaded page.
  entitySpider.AddPageProcessor(new AtzucheProcessor());
  // Pipeline that receives the processed results.
  entitySpider.AddPipeline(new AtzuchePipe());
  entitySpider.ThreadNum = 1;

  entitySpider.Run();

  Console.WriteLine("Press any key to continue...");
  Console.Read();
}

 

Downloader

對目標的請求全部包含在這裡,可以根據需要自行設置,下篇將進行自定義Request的應用

/// <summary>
/// Downloader used by the Atzuche spider. It forwards every request to a
/// freshly created <c>HttpClientDownloader</c>; request customisation can be
/// added here before delegating.
/// </summary>
public class AtzucheDownloader : BaseDownloader
{
  /// <summary>
  /// Downloads the page for <paramref name="request"/> by delegating to
  /// <c>HttpClientDownloader.Download</c>.
  /// </summary>
  /// <param name="request">The request to download.</param>
  /// <param name="spider">The spider issuing the request.</param>
  /// <returns>The page returned by the inner downloader.</returns>
  protected override Page DowloadContent(Request request, ISpider spider)
  {
    var inner = new HttpClientDownloader();
    return inner.Download(request, spider);
  }

  // For v2.5.0+ the override returns a Task instead:
  //
  //   protected override Task<Page> DowloadContent(Request request, ISpider spider)
  //   {
  //     return new HttpClientDownloader().Download(request, spider);
  //   }
}

 

 

新建爬蟲實體類 (如在Main里寫入實體爬蟲,此方法可省略)

/// <summary>
/// Entity spider for the Atzuche car search. The constructor supplies the spider
/// name and site settings (retry count, sleep time, request headers);
/// <see cref="MyInit"/> registers the SQL Server pipeline, the start URL and the
/// <c>AtzucheModel</c> entity type.
/// </summary>
public class AtzucheEntitySpider : EntitySpider
{
  /// <summary>
  /// Registers the storage pipeline, the start URL and the entity type.
  /// </summary>
  /// <param name="arguments">Arguments passed in by the caller; not used here.</param>
  protected override void MyInit(params string[] arguments)
  {
    // Author's note: the database name in the connection string must not contain
    // a '.' — that was observed to raise an error.
    // NOTE(review): credentials are hard-coded here; move them to configuration
    // before using this outside a demo.
    AddPipeline(new SqlServerEntityPipeline("Server=.;Database=AuzucheSpider;uid=sa;pwd=123;MultipleActiveResultSets=true"));
    AddStartUrl("http://www.atzuche.com/car/searchListMap/2?cityCode=330100&sceneCode=U002&filterCondition%5Blon%5D=120.219294&filterCondition%5Blat%5D=30.259258&filterCondition%5Bseq%5D=4&pageNum=1&pageSize=0");
    // Author's note: with the entity type registered, the framework matches the
    // page using the attributes on the entity class and inserts matches into the
    // database, so a custom Processor and Pipe can be omitted. Alternatively,
    // drop this line and store the data through a custom processor and pipeline.
    AddEntityType<AtzucheModel>();
  }

  /// <summary>
  /// Creates the spider named "AuzucheSpider" with the site settings used for
  /// every request.
  /// </summary>
  public AtzucheEntitySpider() : base("AuzucheSpider", new Site
  {
    CycleRetryTimes = 1,
    SleepTime = 200,
    Headers = new Dictionary<string, string>()
    {
      {"Accept","application/json, text/javascript, */*; q=0.01" },
      {"Accept-Encoding","gzip, deflate" },
      // The language preference list belongs under Accept-Language; the key was
      // previously a copy of the Accept-Encoding value, producing a bogus header.
      {"Accept-Language","zh-CN,zh;q=0.9" },
      {"X-Requested-With","XMLHttpRequest" },
      { "Referer", "http://www.atzuche.com/hz/car/search"},
      { "Connection","keep-alive" },
      { "Content-Type","application/json;charset=UTF-8" },
      { "Host","www.atzuche.com"},
      { "User-Agent","Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"}
    }
  })
  {
  }

}

 

接下來是處理器:

解析抓取的數據並封裝到"AtzucheList"內,可在 Pipe 內通過此名稱獲取處理好的數據。

/// <summary>
/// Page processor for the Atzuche search response. It extracts the
/// <c>$.data.content</c> JSON array, deserializes it into a list of
/// <c>AtzucheModel</c> and publishes it under the result key "AtzucheList".
/// </summary>
public class AtzucheProcessor : IPageProcessor
{
  /// <summary>
  /// Parses the downloaded page and stores the resulting list on the page.
  /// </summary>
  /// <param name="page">The downloaded page; its selectable content is queried with JsonPath.</param>
  /// <param name="spider">The spider that downloaded the page; not used here.</param>
  public void Process(Page page, ISpider spider)
  {
    var json = page.Selectable.JsonPath("$.data.content").GetValue();

    // DeserializeObject throws on a null string and can return null (e.g. for the
    // JSON literal "null"); always publish a non-null list so consumers of
    // "AtzucheList" can enumerate it safely.
    var list = string.IsNullOrWhiteSpace(json)
      ? new List<AtzucheModel>()
      : JsonConvert.DeserializeObject<List<AtzucheModel>>(json) ?? new List<AtzucheModel>();

    page.AddResultItem("AtzucheList", list);
  }
}

 

最后是回調,可在此加入保存數據的代碼,至此結束。

/// <summary>
/// Pipeline that receives the results produced by the page processor. It reads
/// the "AtzucheList" entry of every result item, prints the list size and each
/// car's number and type, and collects the car numbers into a local list.
/// </summary>
public class AtzuchePipe : BasePipeline
{
  /// <summary>
  /// Handles a batch of processed results.
  /// </summary>
  /// <param name="resultItems">Result items produced by the processor.</param>
  /// <param name="spider">The spider that produced the results; not used here.</param>
  public override void Process(IEnumerable<ResultItems> resultItems, ISpider spider)
  {
    // Collected here so persistence code (file, database, ...) can be added;
    // nothing in this method consumes the list yet.
    var result = new List<AtzucheModel>();
    foreach (var resultItem in resultItems)
    {
      // Look up and cast once. "as" yields null when the entry is not a
      // List<AtzucheModel>, so skip the item instead of dereferencing null.
      var cars = resultItem.Results["AtzucheList"] as List<AtzucheModel>;
      if (cars == null)
      {
        continue;
      }

      Console.WriteLine(cars.Count);
      foreach (var item in cars)
      {
        result.Add(new AtzucheModel()
        {
          carNo = item.carNo
        });
        Console.WriteLine($"{item.carNo}:{item.type} ");
      }
    }
  }
}

 

   結果圖:

 

總體來說,此框架對新手還是很友好的,靈活寫法可以讓我們有較多的方式去實現爬蟲,因為這個爬蟲比較簡單,就先寫到這里,未來如果可能,會再嘗試使用框架內的多線程、代理等功能,如有心得將繼續分享,希望能對跟我一樣的新手有所幫助,十分感謝。

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM