Deduplication
Built-in deduplication
By default Scrapy deduplicates request URLs; the filter class it uses is RFPDupeFilter (from scrapy.dupefilters import RFPDupeFilter). Let's walk through the source flow.
'http://www.baidu.com?k1=1&k2=2' and 'http://www.baidu.com?k2=2&k1=1' should count as the same request, so simply putting the raw URL or its MD5 hash into a set is not enough. The built-in request_fingerprint helper solves this by computing a canonical fingerprint that ignores query-parameter order.
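A quick way to see this (a minimal sketch, assuming a Scrapy version where scrapy.utils.request.request_fingerprint is still available) is to fingerprint both orderings and compare:

# request_fingerprint canonicalizes the URL, so both parameter orderings
# produce the same fingerprint.
from scrapy.http import Request
from scrapy.utils.request import request_fingerprint

r1 = Request('http://www.baidu.com?k1=1&k2=2')
r2 = Request('http://www.baidu.com?k2=2&k1=1')
print(request_fingerprint(r1) == request_fingerprint(r2))  # True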
Custom deduplication
dupfilter.py
from scrapy.dupefilters import BaseDupeFilter
from scrapy.utils.request import request_fingerprint


class MyFilter(BaseDupeFilter):
    def __init__(self):
        self.visited = set()

    @classmethod
    def from_settings(cls, settings):
        return cls()

    def request_seen(self, request):
        fd = request_fingerprint(request=request)
        if fd in self.visited:
            # already seen: returning True tells the scheduler to drop this request
            return True
        self.visited.add(fd)
        return False

    def open(self):  # can return deferred
        print('starting')

    def close(self, reason):  # can return a deferred
        print('ending')

    def log(self, request, spider):  # log that a request has been filtered
        pass
settings.py
DUPEFILTER_CLASS = 'nj.dupfilter.MyFilter'
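One related detail worth remembering: a Request created with dont_filter=True bypasses the dupefilter entirely, no matter which filter class is configured. A minimal sketch:

# dont_filter=True means the scheduler never consults the dupefilter for this request,
# so it is downloaded even if its fingerprint has already been seen.
from scrapy.http import Request

req = Request('http://www.baidu.com', dont_filter=True)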
Depth
Scrapy's depth control is implemented by a spider middleware, DepthMiddleware (from scrapy.spidermiddlewares.depth import DepthMiddleware). If a request's depth exceeds the configured limit, the request is never handed to the engine and therefore never reaches the scheduler. Let's look at the source; a simplified sketch of the idea follows.
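The actual source is not reproduced here, but the core idea can be sketched roughly like this (a simplified illustration, not the real implementation; DEPTH_LIMIT is the real setting it reads):

from scrapy.http import Request

class SimpleDepthMiddleware(object):
    """Simplified sketch of what DepthMiddleware does, for illustration only."""

    def __init__(self, maxdepth):
        self.maxdepth = maxdepth

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.settings.getint('DEPTH_LIMIT'))

    def process_spider_output(self, response, result, spider):
        # Each request yielded from a response is one level deeper than that response.
        depth = response.meta.get('depth', 0) + 1
        for obj in result:
            if isinstance(obj, Request):
                obj.meta['depth'] = depth
                if self.maxdepth and depth > self.maxdepth:
                    # Too deep: drop it here, so it never reaches the engine or scheduler.
                    continue
            yield obj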
Downloader middleware
#!/usr/bin/env python
# -*- coding:utf-8 -*-
from scrapy.http import HtmlResponse
from scrapy.http import Request


class Md1(object):
    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader middleware.
        # Must either:
        # - return None: continue processing this request, i.e. let the remaining
        #   middlewares and the downloader handle it
        # - or return a Response object: stop calling process_request and start
        #   calling process_response
        # - or return a Request object: stop the middleware chain and hand the
        #   Request back to the scheduler
        # - or raise IgnoreRequest: stop calling process_request and start calling
        #   process_exception() of the installed downloader middlewares
        #   (starting from the last one)
        print('m1.process_request', request)
        # 1. Return a Response: nothing is actually downloaded, and process_response
        #    starts from the next middleware; this is similar to Django middleware
        #    before version 1.9.
        # import requests
        # result = requests.get(request.url)
        # return HtmlResponse(url=request.url, status=200, headers=None, body=result.content)
        # 2. Return a Request: "I don't want to send the request you gave me" -- a new
        #    request is thrown back to the scheduler, so the flow starts at the scheduler
        #    again and the whole middleware chain runs from the beginning.
        # return Request('https://www.cnblogs.com/longyunfeigu/p/9485291.html')
        # 3. Raise an exception: the request goes no further and stops here.
        # from scrapy.exceptions import IgnoreRequest
        # raise IgnoreRequest
        # 4. Modify the request in place (*) -- the most commonly used pattern.
        # request.headers['user-agent'] = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"
        raise AttributeError('hhh')
    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.
        # Must either:
        # - return a Response object: handed on to the other middlewares' process_response
        # - return a Request object: strictly speaking, re-issuing a request belongs in
        #   process_request, but it is supported here too; the request goes back to the
        #   scheduler and the flow starts over from there
        # - or raise IgnoreRequest: Request.errback is called
        print('m1.process_response', request, response)
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.
        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        print('m1.process_exception')
class Md2(object):
    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader middleware.
        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        print('md2.process_request', request)

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.
        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        print('m2.process_response', request, response)
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.
        # Must either:
        # - return None: continue processing this exception, i.e. let later
        #   middlewares handle it
        # - return a Response object: stops the process_exception() chain and starts
        #   process_response from the last (outermost) middleware
        # - return a Request object: stops the middleware chain; the request is
        #   rescheduled for download
        # from scrapy.exceptions import IgnoreRequest
        # # The exception is ignored; process_response will not be executed either.
        # raise IgnoreRequest
        # print('m2.process_exception')
        # return HtmlResponse(url=request.url, status=200, headers=None, body=b'xx')
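For these two middlewares to run, they have to be registered in settings.py. A sketch, assuming they live in a file named md.py inside the nj project (the module path and priority numbers are placeholders):

# settings.py -- lower numbers run process_request earlier (and process_response later)
DOWNLOADER_MIDDLEWARES = {
    'nj.md.Md1': 666,
    'nj.md.Md2': 667,
}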
Built-in proxy support
A few things can be seen from the source:
- os.environ is used like a dict to set proxy information, but the format matters; also, os.environ only reflects the current process's environment variables, so other processes cannot see values this process sets there
- a proxy can also be defined in request.meta, and the source shows that it takes precedence over os.environ (a short sketch of both follows)
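A minimal sketch of the two mechanisms (the proxy addresses are placeholders; both are handled by the built-in HttpProxyMiddleware):

import os
from scrapy.http import Request

# 1. Environment variable: read when HttpProxyMiddleware is initialised,
#    and visible only to this process.
os.environ['http_proxy'] = 'http://root:123456@192.168.11.11:9999/'

# 2. Per-request meta: takes precedence over anything set via os.environ.
req = Request('http://www.baidu.com', meta={'proxy': 'http://192.168.11.12:9999'})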
Custom proxy
Adapted from the source:
import base64
import random
from six.moves.urllib.parse import unquote, urlunparse
try:
    from urllib2 import _parse_proxy
except ImportError:
    from urllib.request import _parse_proxy
from scrapy.utils.python import to_bytes


class XdbProxyMiddleware(object):
    def _basic_auth_header(self, username, password):
        user_pass = to_bytes(
            '%s:%s' % (unquote(username), unquote(password)),
            encoding='latin-1')
        return base64.b64encode(user_pass).strip()

    def process_request(self, request, spider):
        PROXIES = [
            "http://root:123456@192.168.11.11:9999/",
            "http://root:123456@192.168.11.12:9999/",
            "http://root:123456@192.168.11.13:9999/",
            "http://root:123456@192.168.11.14:9999/",
            "http://root:123456@192.168.11.15:9999/",
            "http://root:123456@192.168.11.16:9999/",
        ]
        url = random.choice(PROXIES)
        orig_type = ""
        proxy_type, user, password, hostport = _parse_proxy(url)
        proxy_url = urlunparse((proxy_type or orig_type, hostport, '', '', '', ''))
        if user:
            creds = self._basic_auth_header(user, password)
        else:
            creds = None
        request.meta['proxy'] = proxy_url
        if creds:
            request.headers['Proxy-Authorization'] = b'Basic ' + creds
Or:
import base64
import random
from scrapy.utils.python import to_bytes


class DdbProxyMiddleware(object):
    def process_request(self, request, spider):
        PROXIES = [
            {'ip_port': '111.11.228.75:80', 'user_pass': 'root@123456'},
            {'ip_port': '120.198.243.22:80', 'user_pass': 'root@123456'},
            {'ip_port': '111.8.60.9:8123', 'user_pass': 'root@123456'},
            {'ip_port': '101.71.27.120:80', 'user_pass': 'root@123456'},
            {'ip_port': '122.96.59.104:80', 'user_pass': 'root@123456'},
            {'ip_port': '122.224.249.122:8088', 'user_pass': 'root@123456'},
        ]
        proxy = random.choice(PROXIES)
        if proxy['user_pass'] is not None:
            request.meta['proxy'] = to_bytes("http://%s" % proxy['ip_port'])
            encoded_user_pass = base64.b64encode(to_bytes(proxy['user_pass']))
            request.headers['Proxy-Authorization'] = b'Basic ' + encoded_user_pass
        else:
            request.meta['proxy'] = to_bytes("http://%s" % proxy['ip_port'])
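Either middleware is enabled the same way, by registering it in settings.py (the module path and priority below are placeholders for wherever the file actually lives in the project):

DOWNLOADER_MIDDLEWARES = {
    'nj.proxy.XdbProxyMiddleware': 751,
}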
Spider middleware
class SpiderMiddleware(object):
    def process_spider_input(self, response, spider):
        """
        Called for each response after it has been downloaded, before it is handed
        to the spider's parse callback.
        :param response:
        :param spider:
        :return:
        """
        pass

    def process_spider_output(self, response, result, spider):
        """
        Called with the results returned by the spider after it has processed the response.
        :param response:
        :param result:
        :param spider:
        :return: must return an iterable of Request and/or Item objects
        """
        return result

    def process_spider_exception(self, response, exception, spider):
        """
        Called when the spider (or process_spider_input) raises an exception.
        :param response:
        :param exception:
        :param spider:
        :return: None to keep passing the exception to later middlewares, or an
                 iterable containing Response or Item objects, which is handed on
                 to the scheduler or the pipelines
        """
        return None

    def process_start_requests(self, start_requests, spider):
        """
        Called with the spider's start requests when the spider is opened.
        :param start_requests:
        :param spider:
        :return: an iterable of Request objects
        """
        return start_requests
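As with the downloader middlewares, this only takes effect once it is registered in settings.py (module path and priority are placeholders):

SPIDER_MIDDLEWARES = {
    'nj.sd.SpiderMiddleware': 666,
}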
Custom commands
Single spider
from scrapy.cmdline import execute

if __name__ == '__main__':
    execute(["scrapy", "crawl", "cnblog", "--nolog"])
Starting multiple spiders in a single process
- Create a directory (any name, e.g. commands) at the same level as spiders
- Create a crawlall.py file inside it (the file name becomes the custom command name):
from scrapy.commands import ScrapyCommand
from scrapy.utils.project import get_project_settings


class Command(ScrapyCommand):
    requires_project = True

    def syntax(self):
        return '[options]'

    def short_desc(self):
        return 'Runs all of the spiders'

    def run(self, args, opts):
        spider_list = self.crawler_process.spiders.list()
        for name in spider_list:
            self.crawler_process.crawl(name, **opts.__dict__)
        self.crawler_process.start()
- Add COMMANDS_MODULE = '<project name>.<directory name>' to settings.py
- Run scrapy crawlall from the project directory