import scrapy
from scrapy.spidermiddlewares.httperror import HttpError
from twisted.internet.error import DNSLookupError
from twisted.internet.error import TimeoutError, TCPTimedOutError

class ErrbackSpider(scrapy.Spider):
    name = "errback_example"
    start_urls = [
        "http://www.httpbin.org/",              # expected HTTP 200 response
        "http://www.httpbin.org/status/404",    # 404 Not Found error
        "http://www.httpbin.org/status/500",    # 500 internal server error
        "http://www.httpbin.org:12345/",        # non-responding port, expected timeout
        "http://www.httphttpbinbin.org/",       # expected DNS lookup error
    ]

    def start_requests(self):
        for u in self.start_urls:
            yield scrapy.Request(u, callback=self.parse_httpbin,
                                 errback=self.errback_httpbin,
                                 dont_filter=True)

    def parse_httpbin(self, response):
        self.logger.info('Got successful response from {}'.format(response.url))
        # do whatever processing you need here...

    def errback_httpbin(self, failure):
        # log all failures
        self.logger.error(repr(failure))

        # if you want to handle specific exception types differently,
        # you need to check the failure's type:
        if failure.check(HttpError):
            # HttpError is raised by the HttpErrorMiddleware;
            # the non-200 response is available on the failure
            response = failure.value.response
            self.logger.error('HttpError on %s', response.url)

        elif failure.check(DNSLookupError):
            # the original Request is attached to the failure
            request = failure.request
            self.logger.error('DNSLookupError on %s', request.url)

        elif failure.check(TimeoutError, TCPTimedOutError):
            request = failure.request
            self.logger.error('TimeoutError on %s', request.url)
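
# Optional, illustrative addition (not part of the original example): one way to
# run this spider from a plain Python script instead of "scrapy crawl
# errback_example", using Scrapy's CrawlerProcess. The settings shown
# (LOG_LEVEL, DOWNLOAD_TIMEOUT) are assumed convenience tweaks, not requirements.

from scrapy.crawler import CrawlerProcess

if __name__ == "__main__":
    process = CrawlerProcess(settings={
        "LOG_LEVEL": "INFO",        # keep the console output readable
        "DOWNLOAD_TIMEOUT": 10,     # fail the port-12345 URL sooner (optional)
    })
    process.crawl(ErrbackSpider)    # schedule the spider defined above
    process.start()                 # start the reactor; blocks until the crawl finishes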