middlewares.py file
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
import random
from scrapy import signals
class TutorialDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.
        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.
        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.
        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
# A custom downloader middleware: an IP proxy pool
from collections import defaultdict

from twisted.internet.error import ConnectionRefusedError, TimeoutError
from scrapy.exceptions import NotConfigured

class RandomProxyMiddleware(object):

    def __init__(self, settings):
        # Step 3: initialize settings and state.
        # PROXIES is a list you add to settings.py; read the proxies in from there.
        self.proxies = settings.getlist("PROXIES")
        self.stats = defaultdict(int)  # per-proxy failure counter, defaults to 0
        self.max_failed = 3            # allow at most 3 failed requests per proxy
    @classmethod
    def from_crawler(cls, crawler):
        # Step 1: create the middleware object.
        # First check HTTPPROXY_ENABLED to see whether proxying is enabled at all.
        if not crawler.settings.getbool("HTTPPROXY_ENABLED"):  # proxying disabled
            raise NotConfigured
        # auth_encoding = crawler.settings.get("HTTPPROXY_AUTH_ENCODING")  # not needed for now
        # Step 2: instantiate the middleware.
        return cls(crawler.settings)  # cls() calls __init__(); since __init__ takes settings, pass them here
    def process_request(self, request, spider):
        # Step 4: assign a random proxy to each request.
        # Skip requests that already carry a proxy, and do not proxy the start URLs.
        if self.proxies and not request.meta.get("proxy") and request.url not in spider.start_urls:
            request.meta["proxy"] = random.choice(self.proxies)
    def process_response(self, request, response, spider):
        # Step 5: the request succeeded; inspect the response.
        cur_proxy = request.meta.get('proxy')
        # Check whether the target site appears to have blocked us.
        if response.status > 400:
            # Add one failure to this proxy's count.
            self.stats[cur_proxy] += 1
            print("Proxy {} got a bad status code for the {}th time".format(cur_proxy, self.stats[cur_proxy]))
            # Once a proxy has accumulated enough failures...
            if self.stats[cur_proxy] >= self.max_failed:  # this proxy has failed 3 or more times
                print("Status code {}: proxy {} has probably been banned".format(response.status, cur_proxy))
                # Treat the proxy as banned and remove it from the pool.
                self.remove_proxy(cur_proxy)
                del request.meta['proxy']
                # Hand the request back to the scheduler so it is downloaded again.
                return request
        # The status code is fine, so return the response as usual.
        return response
    def process_exception(self, request, exception, spider):
        # Step 5 (failure path): the request raised an exception.
        cur_proxy = request.meta.get('proxy')  # the proxy used for this request, if any
        # If this request went through a proxy and the network call failed,
        # assume the proxy itself is broken.
        if cur_proxy and isinstance(exception, (ConnectionRefusedError, TimeoutError)):
            print("Got {} while using proxy {}".format(exception, cur_proxy))
            self.remove_proxy(cur_proxy)
            del request.meta['proxy']
            # Reschedule the request so it is downloaded again.
            return request
    def remove_proxy(self, proxy):
        if proxy in self.proxies:
            self.proxies.remove(proxy)
            print("Removed {} from the proxy list".format(proxy))
settings.py file
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'tutorial.middlewares.RandomProxyMiddleware': 749,  # adjust the downloader middleware priority
}
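For the middleware to have any proxies to choose from, the same settings.py also needs a PROXIES list (from_crawler additionally checks HTTPPROXY_ENABLED, which is on by default in Scrapy). Priority 749 places the middleware just before the built-in HttpProxyMiddleware (750), so the meta['proxy'] value it sets is picked up there. A sketch with placeholder proxy addresses:

# HTTPPROXY_ENABLED = True  # Scrapy's default; RandomProxyMiddleware.from_crawler checks this flag

# Placeholder proxy addresses -- replace them with working proxies.
PROXIES = [
    "http://127.0.0.1:8888",
    "http://127.0.0.1:8889",
    "http://127.0.0.1:8890",
]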