# 今日目標

**scrapy之盜墓筆記三級頁面爬取**

今天要爬取的是盜墓筆記小說,由分析該小說的主要內容在三級頁面里,故需要我們一一解析

*代碼實現*

daomu.py

```python
import scrapy

from ..items import DaomuItem


class DaomuSpider(scrapy.Spider):
    """Three-level spider for the Daomubiji novel site.

    Level 1 collects volume links, level 2 collects chapter metadata and
    links, level 3 collects the chapter text itself.
    """
    name = 'daomu'
    allowed_domains = ['daomubiji.com']
    start_urls = ['http://www.daomubiji.com/']

    def parse(self, response):
        """Level 1: extract per-volume links and hand them to the scheduler."""
        # link_list: ['http://xxx/dao-mu-bi-ji-1', ...]
        link_list = response.xpath('//ul[@class="sub-menu"]/li/a/@href').extract()
        for link in link_list:
            yield scrapy.Request(
                url=link,
                callback=self.parse_two_html,
            )

    def parse_two_html(self, response):
        """Level 2: extract volume name, chapter number, chapter name, link."""
        # Base xpath: one <article> per chapter entry.
        article_list = response.xpath('//article')
        for article in article_list:
            # Anchor text, e.g. '七星魯王 第一章 血屍'.
            text = article.xpath('./a/text()').get()
            if not text:
                # Skip malformed entries instead of crashing on None.split().
                continue
            info_list = text.split()

            item = DaomuItem()
            if len(info_list) >= 3:
                item['volume_name'] = info_list[0]
                item['zh_num'] = info_list[1]
                item['zh_name'] = info_list[2]
            elif len(info_list) == 2:
                # Some entries omit the chapter number.
                item['volume_name'] = info_list[0]
                item['zh_name'] = info_list[1]
                item['zh_num'] = ''
            else:
                # Not enough fields to build a usable item.
                continue

            # Extract the chapter link and enqueue the level-3 request.
            item['zh_link'] = article.xpath('./a/@href').get()
            yield scrapy.Request(
                url=item['zh_link'],
                # meta: carry the partially-filled item to the next callback.
                meta={'item': item},
                callback=self.parse_three_html,
            )

    def parse_three_html(self, response):
        """Level 3: extract the chapter body paragraphs and yield the item."""
        # Item passed along from parse_two_html via meta.
        item = response.meta['item']
        # content_list: ['段落1', '段落2', ...]
        content_list = response.xpath(
            '//article[@class="article-content"]//p/text()'
        ).extract()
        item['zh_content'] = '\n'.join(content_list)
        yield item
```

items.py

```python
import scrapy


class DaomuItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # Volume name (卷名)
    volume_name = scrapy.Field()
    # Chapter number (章節數)
    zh_num = scrapy.Field()
    # Chapter name (章節名稱)
    zh_name = scrapy.Field()
    # Chapter link (章節鏈接)
    zh_link = scrapy.Field()
    # Chapter content (小說內容)
    zh_content = scrapy.Field()
```

pipelines.py

```python
class DaomuPipeline(object):
    """Write each chapter to its own file, named volume_num_name."""

    def process_item(self, item, spider):
        filename = '/home/tarena/daomu/{}_{}_{}'.format(
            item['volume_name'], item['zh_num'], item['zh_name']
        )
        # utf-8 is required: the content is Chinese text and the default
        # locale encoding may fail or corrupt it.
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(item['zh_content'])
        return item
```
