設置代理的位置:下載中間件
一、內置代理(優點:簡單,缺點:只能代理一個ip)
1、源碼分析
process_request(self, request, spider)在下載器執行前執行
if scheme in self.proxies: self._set_proxy(request, scheme)
_set_proxy方法(設置代理)->self.proxies[scheme]->self.proxies = {} (__init__)->getproxies()->getproxies_environment->return proxies
環境變量
os.environ = {
"HTTP_PROXY": "192.169.1.1",
"HTTPS_PROXY": "1.1.1.1",
"HTTPS": '10.10.10.10'
}
通過getproxies_environment方法的處理
proxies = {
'http': "192.169.1.1",
'https': "1.1.1.1",
}
加()執行
getproxies() = proxies
for type_, url in getproxies().items(): self.proxies[type_] = self._get_proxy(url, type_)
_get_proxy(url, type_) 處理代理使用用戶名和密碼的情況,如下->return creds, proxy_url
getproxies() = {
'http': "http://用戶名:密碼@192.169.1.1:端口",
'https': "1.1.1.1",
}
self.proxies = {
'http': ("basic 用戶名密碼加密后的值", 192.169.1.1:端口),
'https': (None, ip:端口)
}
2、方法一,內置代理
在start_requests(定制起始請求)設置環境變量即可
def start_requests(self):
    """Built-in proxy, method 1: export proxy URLs as environment variables.

    Scrapy's HttpProxyMiddleware reads HTTP_PROXY / HTTPS_PROXY from the
    environment (via urllib's getproxies()), so setting them before the
    first request is yielded is enough to route all traffic through them.
    """
    import os

    os.environ["HTTP_PROXY"] = "http://用戶名:密碼@192.169.1.1:端口"
    os.environ["HTTPS_PROXY"] = "http://用戶名:密碼@114.110.1.1:端口"
    # Emit one request per configured start URL.
    yield from (Request(url=url, callback=self.parse) for url in self.start_urls)
3、方法二,內置代理
根據源碼分析
# Excerpt from scrapy's HttpProxyMiddleware.process_request:
# a proxy set in request.meta takes priority over the env-derived self.proxies.
if 'proxy' in request.meta:
    # meta['proxy'] = None explicitly disables proxying for this request.
    if request.meta['proxy'] is None:
        return
    # extract credentials if present
    creds, proxy_url = self._get_proxy(request.meta['proxy'], '')
    # Store the credential-free URL back; credentials go into the header below.
    request.meta['proxy'] = proxy_url
    if creds and not request.headers.get('Proxy-Authorization'):
        request.headers['Proxy-Authorization'] = b'Basic ' + creds
    return
elif not self.proxies:
    # No meta proxy and no environment proxies configured: nothing to do.
    return
格式,使用meta
def start_requests(self):
    """Built-in proxy, method 2: set the proxy per request via meta['proxy'].

    This takes priority over any HTTP_PROXY / HTTPS_PROXY environment
    variables, so no os.environ setup is needed here.
    """
    for url in self.start_urls:
        yield Request(
            url=url,
            callback=self.parse,
            meta={'proxy': "http://用戶名:密碼@192.169.1.1:端口"},
        )
注意:優先執行meta
二、自定義代理(自定義中間件)
import base64
import random

from six.moves.urllib.parse import unquote, urlunparse

try:
    from urllib2 import _parse_proxy  # Python 2
except ImportError:
    from urllib.request import _parse_proxy  # Python 3

from scrapy.utils.python import to_bytes


class MYHTTPProxyMiddleware(object):
    """Downloader middleware that assigns a random proxy to every request.

    Mirrors scrapy's HttpProxyMiddleware behaviour: the proxy URL goes into
    request.meta['proxy'] and any user:password pair becomes a
    Proxy-Authorization: Basic header.
    """

    # Candidate proxy URLs. Hoisted to a class attribute so the list is
    # built once, not on every process_request call.
    PROXIES = [
        "http://root:@WSX3edc@192.168.1.1:8000/",
        "http://root:@WSX3edc@192.168.1.2:8000/",
        "http://root:@WSX3edc@192.168.1.3:8000/",
        "http://root:@WSX3edc@192.168.1.4:8000/",
        "http://root:@WSX3edc@192.168.1.5:8000/",
    ]

    def _basic_auth_header(self, username, password):
        """Return base64("user:password") for a Proxy-Authorization header.

        Credentials arrive percent-encoded inside the proxy URL, so they
        are unquoted before encoding.
        """
        user_pass = to_bytes(
            '%s:%s' % (unquote(username), unquote(password)),
            encoding='latin-1')
        return base64.b64encode(user_pass).strip()

    def process_request(self, request, spider):
        """Pick a random proxy and attach it (plus credentials) to request."""
        url = random.choice(self.PROXIES)
        proxy_type, user, password, hostport = _parse_proxy(url)
        # Rebuild a credential-free proxy URL (scheme://host:port).
        proxy_url = urlunparse((proxy_type or '', hostport, '', '', '', ''))
        creds = self._basic_auth_header(user, password) if user else None
        request.meta['proxy'] = proxy_url
        if creds:
            request.headers['Proxy-Authorization'] = b'Basic ' + creds
修改配置文件(56行)
# Register the custom proxy middleware in settings.py; 543 is the priority
# (lower numbers run closer to the engine, higher closer to the downloader).
DOWNLOADER_MIDDLEWARES = {
    # 'toscrapy.middlewares.ToscrapyDownloaderMiddleware': 543,
    'toscrapy.proxy.MYHTTPProxyMiddleware': 543,
}
查看默認的下載中間件和中間件的權重值