源碼
class HttpErrorMiddleware(object):
    """Spider middleware that rejects responses whose status is not 2xx.

    A response is handed on when its status is in the 200-299 range or when
    the status has been explicitly allowed through ``response.meta``, a
    spider attribute or the settings. Any other response causes
    ``HttpError`` to be raised from ``process_spider_input``.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware from ``crawler.settings``."""
        return cls(crawler.settings)

    def __init__(self, settings):
        """Read the global defaults from *settings*.

        ``HTTPERROR_ALLOW_ALL`` is read with ``getbool`` and
        ``HTTPERROR_ALLOWED_CODES`` with ``getlist``.
        """
        self.handle_httpstatus_all = settings.getbool('HTTPERROR_ALLOW_ALL')
        self.handle_httpstatus_list = settings.getlist('HTTPERROR_ALLOWED_CODES')

    def process_spider_input(self, response, spider):
        """Return ``None`` for an allowed response, raise ``HttpError`` otherwise.

        Checks are applied in this order:

        1. a status in the 200-299 range is always allowed;
        2. a truthy ``handle_httpstatus_all`` in ``response.meta`` allows
           every status;
        3. ``handle_httpstatus_list`` in ``response.meta`` is used as the
           list of allowed statuses;
        4. otherwise the ``HTTPERROR_ALLOW_ALL`` setting allows every status;
        5. otherwise the spider's ``handle_httpstatus_list`` attribute is
           used, falling back to ``HTTPERROR_ALLOWED_CODES``.
        """
        if 200 <= response.status < 300:  # common case
            return

        meta = response.meta
        # Test the value, not mere presence of the key: an explicit
        # ``handle_httpstatus_all: False`` must not let every status through.
        if meta.get('handle_httpstatus_all', False):
            return

        if 'handle_httpstatus_list' in meta:
            allowed_statuses = meta['handle_httpstatus_list']
        elif self.handle_httpstatus_all:
            return
        else:
            allowed_statuses = getattr(
                spider, 'handle_httpstatus_list', self.handle_httpstatus_list
            )

        if response.status in allowed_statuses:
            return
        raise HttpError(response, 'Ignoring non-200 response')

    def process_spider_exception(self, response, exception, spider):
        """Record and swallow an ``HttpError`` raised for *response*.

        For an ``HttpError`` two stats counters are incremented (a total and
        a per-status one), an info message is logged, and an empty list is
        returned. Any other exception falls through and ``None`` is returned
        implicitly.
        """
        if isinstance(exception, HttpError):
            spider.crawler.stats.inc_value('httperror/response_ignored_count')
            spider.crawler.stats.inc_value(
                'httperror/response_ignored_status_count/%s' % response.status
            )
            logger.info(
                "Ignoring response %(response)r: HTTP status code is not handled or not allowed",
                {'response': response}, extra={'spider': spider},
            )
            return []
通過源碼的 __init__ 函數可以看到,可以配置以下兩個配置項:
HTTPERROR_ALLOW_ALL = True
HTTPERROR_ALLOWED_CODES=[301,404]
第一個配置表示是否允許所有狀態碼:收到響應後,不管是什麼狀態碼都返回給爬蟲
第二個是允許的列表
以上是全局配置 不推薦
如果想在每個爬蟲里面進行配置
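可以在單獨的爬蟲里面設置(注意:從上面的源碼看,handle_httpstatus_all 是從 response.meta 讀取的,要寫在請求的 meta 里;只有 handle_httpstatus_list 會從爬蟲屬性讀取)
handle_httpstatus_all = True
handle_httpstatus_list = [404,302]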
除非你非常熟悉你的網站和 scrapy,否則不建議使用這些配置,因為把錯誤的響應也返回給爬蟲,沒什麼用
