"""CloseSpider is an extension that forces spiders to be closed after certain
conditions are met.
See documentation in docs/topics/extensions.rst
"""
class CloseSpider(object):
    """Extension that force-closes a spider when a configured limit is hit.

    Supported limits (each read from settings; a falsy value disables it):
      * CLOSESPIDER_TIMEOUT    -- seconds after the spider opens
      * CLOSESPIDER_ITEMCOUNT  -- number of items scraped
      * CLOSESPIDER_PAGECOUNT  -- number of responses received
      * CLOSESPIDER_ERRORCOUNT -- number of spider errors

    Raises NotConfigured when every limit is disabled, so Scrapy skips
    the extension entirely.
    """
    # NOTE(review): this excerpt carries no import lines; it relies on
    # `defaultdict` (collections), `reactor` (twisted.internet) and
    # `signals` / `NotConfigured` (scrapy) being imported at module
    # level -- confirm against the full file.

    def __init__(self, crawler):
        self.crawler = crawler
        # Threshold per condition, keyed by the reason suffix used when
        # closing the spider.
        self.close_on = {
            'timeout': crawler.settings.getfloat('CLOSESPIDER_TIMEOUT'),
            'itemcount': crawler.settings.getint('CLOSESPIDER_ITEMCOUNT'),
            'pagecount': crawler.settings.getint('CLOSESPIDER_PAGECOUNT'),
            'errorcount': crawler.settings.getint('CLOSESPIDER_ERRORCOUNT'),
        }
        if not any(self.close_on.values()):
            # Nothing to enforce -- tell Scrapy not to load us.
            raise NotConfigured
        self.counter = defaultdict(int)
        self.task = None  # delayed-call handle for the timeout condition
        # Subscribe only to the signals whose limit is actually enabled.
        if self.close_on['errorcount']:
            crawler.signals.connect(self.error_count, signal=signals.spider_error)
        if self.close_on['pagecount']:
            crawler.signals.connect(self.page_count, signal=signals.response_received)
        if self.close_on['timeout']:
            crawler.signals.connect(self.spider_opened, signal=signals.spider_opened)
        if self.close_on['itemcount']:
            crawler.signals.connect(self.item_scraped, signal=signals.item_scraped)
        crawler.signals.connect(self.spider_closed, signal=signals.spider_closed)

    @classmethod
    def from_crawler(cls, crawler):
        """Standard Scrapy extension entry point."""
        return cls(crawler)

    def error_count(self, failure, response, spider):
        """Close *spider* once enough spider errors have occurred.

        Uses >= (not ==) so the spider still closes even if the counter
        ever overshoots the threshold.
        """
        self.counter['errorcount'] += 1
        if self.counter['errorcount'] >= self.close_on['errorcount']:
            self.crawler.engine.close_spider(spider, 'closespider_errorcount')

    def page_count(self, response, request, spider):
        """Close *spider* once enough responses have been received."""
        self.counter['pagecount'] += 1
        if self.counter['pagecount'] >= self.close_on['pagecount']:
            self.crawler.engine.close_spider(spider, 'closespider_pagecount')

    def spider_opened(self, spider):
        """Schedule the timeout-based close when the spider starts."""
        self.task = reactor.callLater(
            self.close_on['timeout'],
            self.crawler.engine.close_spider,
            spider,
            reason='closespider_timeout',
        )

    def item_scraped(self, item, spider):
        """Close *spider* once enough items have been scraped."""
        self.counter['itemcount'] += 1
        if self.counter['itemcount'] >= self.close_on['itemcount']:
            self.crawler.engine.close_spider(spider, 'closespider_itemcount')

    def spider_closed(self, spider):
        """Cancel the pending timeout call, if any, on shutdown."""
        task = self.task
        if task and task.active():
            task.cancel()
1. 上述代碼是 Scrapy 中用於自動關閉爬蟲的擴展類(CloseSpider)。從代碼中可以看出,它實現了 timeout、itemcount、pagecount、errorcount 四種關閉條件;只要在 settings 中配置對應選項,任一條件被觸發時爬蟲就會自動停止。
在 settings 中可設置以下選項:
CLOSESPIDER_TIMEOUT    # 運行指定時間(秒)後退出
CLOSESPIDER_ITEMCOUNT  # 生成指定數量的 item 後退出
CLOSESPIDER_PAGECOUNT  # 抓取指定數量的響應後退出
CLOSESPIDER_ERRORCOUNT # 發生指定數量的錯誤後退出
# 打開EXTENSIONS擴展
EXTENSIONS = {
'scrapy.extensions.closespider.CloseSpider': 500,
}
2. 從 CloseSpider 類中可以看到,停止爬蟲是通過調用 self.crawler.engine.close_spider() 方法實現的;因此在滿足自定義條件時,我們也可以直接調用這個方法來停止 Scrapy 爬蟲。
# 在 Spider 文件中:self.crawler.engine.close_spider(self, '關閉原因')
# 在 middlewares 文件中:spider.crawler.engine.close_spider(spider, '關閉原因')

