Using scrapy-redis for request deduplication
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request


class ChoutiSpider(scrapy.Spider):
    name = 'chouti'
    allowed_domains = ['chouti.com']
    start_urls = ['http://www.chouti.com/']

    def start_requests(self):
        url = "http://dig.chouti.com/"
        yield Request(url=url, callback=self.parse)

    def parse(self, response):
        print('response', response)
A custom dupe-filter component that filters duplicate request URLs for the spider and saves the visited fingerprints in Redis:
#!/usr/bin/env python
# -*- coding:utf-8 -*-

import time
from scrapy.dupefilters import BaseDupeFilter
from scrapy.utils.request import request_fingerprint
import redis
from scrapy_redis.dupefilter import RFPDupeFilter
from scrapy_redis.connection import get_redis_from_settings
from scrapy_redis import defaults


class DupeFilter(BaseDupeFilter):
    def __init__(self):
        self.conn = redis.Redis(host='192.168.1.13', port=3306, password='woshinidaye')

    def request_seen(self, request):
        fd = request_fingerprint(request)
        # sadd returns 1 when the fingerprint is new, 0 when it already exists
        result = self.conn.sadd('visited_urls', fd)
        if result == 1:
            return False
        return True


class RedisDupeFilter(RFPDupeFilter):
    """
    Override the Redis key used by the stock filter; the original source
    uses a timestamp as part of the key by default.
    """

    @classmethod
    def from_settings(cls, settings):
        """Returns an instance from given settings.

        This uses by default the key ``dupefilter:<timestamp>``. When using the
        ``scrapy_redis.scheduler.Scheduler`` class, this method is not used as
        it needs to pass the spider name in the key.

        Parameters
        ----------
        settings : scrapy.settings.Settings

        Returns
        -------
        RFPDupeFilter
            A RFPDupeFilter instance.
        """
        server = get_redis_from_settings(settings)
        # XXX: This creates one-time key. needed to support to use this
        # class as standalone dupefilter with scrapy's default scheduler
        # if scrapy passes spider on open() method this wouldn't be needed
        # TODO: Use SCRAPY_JOB env as default and fallback to timestamp.
        # The stock key uses a timestamp, which changes on every run and is
        # awkward to look up, so it is pinned to a fixed value here:
        # key = defaults.DUPEFILTER_KEY % {'timestamp': int(time.time())}
        key = defaults.DUPEFILTER_KEY % {'timestamp': 'woshinidie'}
        debug = settings.getbool('DUPEFILTER_DEBUG')
        return cls(server, key=key, debug=debug)
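To sanity-check that the custom DupeFilter is actually recording fingerprints, you can query the visited_urls set directly with redis-py. This is a minimal sketch that simply reuses the host/port/password values from the example above; adjust them to your own Redis instance.

import redis

# Same connection parameters as the DupeFilter example above (adjust to your setup)
conn = redis.Redis(host='192.168.1.13', port=3306, password='woshinidaye')

# Number of unique request fingerprints recorded so far
print(conn.scard('visited_urls'))

# Peek at a few of the stored fingerprints (SHA1 hex strings)
for fp in list(conn.smembers('visited_urls'))[:5]:
    print(fp)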
Settings (settings.py)
# Redis dedup settings
REDIS_HOST = '192.168.1.13'                     # host
REDIS_PORT = 3306                               # port
REDIS_PARAMS = {'password': 'woshinidaye'}      # Redis connection parameters
# Default: REDIS_PARAMS = {'socket_timeout': 30, 'socket_connect_timeout': 30, 'retry_on_timeout': True, 'encoding': REDIS_ENCODING}
# REDIS_PARAMS['redis_cls'] = 'myproject.RedisClient'  # Python class used to connect to Redis. Default: redis.StrictRedis
REDIS_ENCODING = "utf-8"                        # Redis encoding. Default: 'utf-8'

# REDIS_URL = 'redis://user:pass@hostname:9001' # Connection URL (takes precedence over the settings above, as the source shows)
DUPEFILTER_KEY = 'dupefilter:%(timestamp)s'
# The stock filter uses the timestamp as part of the key by default
# DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'
# Customized on top of the stock source to change the key stored in Redis
DUPEFILTER_CLASS = 'redisdepth.xxx.RedisDupeFilter'
# Fully custom Redis dedup class
# DUPEFILTER_CLASS = 'redisdepth.xxx.DupeFilter'
Scrapy-redis queues
These include: a FIFO queue, a LIFO queue, and a priority queue.
1. FIFO queue
#!/usr/bin/env python
# -*- coding:utf-8 -*-

import redis


class FifoQueue(object):
    def __init__(self):
        self.server = redis.Redis(host='192.168.1.13', port=3306, password='woshinidaye')

    def push(self, request):
        """Push a request"""
        self.server.lpush('User', request)

    def pop(self):
        """Pop a request"""
        data = self.server.rpop('User')
        return data


q = FifoQueue()
q.push(11)
q.push(22)
q.push(33)
print(q.pop())
# FIFO queue: lpush + rpop, so 11 comes out first
2. LIFO queue
#!/usr/bin/env python
# -*- coding:utf-8 -*-

import redis


class LifoQueue(object):

    def __init__(self):
        self.server = redis.Redis(host='192.168.1.13', port=3306, password='woshinidaye')

    def push(self, request):
        """Push a request"""
        self.server.lpush('User', request)

    def pop(self, timeout=0):
        """Pop a request"""
        data = self.server.lpop('User')
        return data
3. Priority queue
#!/usr/bin/env python
# -*- coding:utf-8 -*-

import redis


class PriorityQueue(object):
    """Per-spider priority queue abstraction using redis' sorted set"""

    def __init__(self):
        self.server = redis.Redis(host='192.168.1.13', port=3306, password='woshinidaye')

    def push(self, request, score):
        """Push a request with the given score"""
        # In the original scrapy_redis queue the score is derived from the
        # request itself (score = -request.priority); here the caller passes it in.
        # We don't use zadd method as the order of arguments change depending on
        # whether the class is Redis or StrictRedis, and the option of using
        # kwargs only accepts strings, not bytes.
        self.server.execute_command('ZADD', 'xxxx', score, request)

    def pop(self, timeout=0):
        """
        Pop a request
        timeout not support in this queue class
        """
        # use atomic range/remove using multi/exec
        pipe = self.server.pipeline()
        pipe.multi()
        pipe.zrange('xxxx', 0, 0).zremrangebyrank('xxxx', 0, 0)
        results, count = pipe.execute()
        if results:
            return results[0]


q = PriorityQueue()

q.push('ZH', -99)
q.push('SB', -66)
q.push('JJ', -33)
# Popping scores from low to high gives breadth-first order; high to low gives depth-first
print(q.pop())  # pops the lowest score by default
print(q.pop())
print(q.pop())
Scheduler source analysis (written in Notepad++ and pasted here)
1. Locate from scrapy_redis.scheduler import Scheduler
   - Scheduler.from_crawler is executed
   - Scheduler.from_settings is executed
     - Reads from settings:
       SCHEDULER_PERSIST            # keep the scheduler queue and dedup records on close? True = keep, False = flush
       SCHEDULER_FLUSH_ON_START     # flush the scheduler queue and dedup records on start? True = flush, False = keep
       SCHEDULER_IDLE_BEFORE_CLOSE  # maximum time to wait when the scheduler queue is empty (if still nothing, no request is returned)
     - Reads from settings:
       SCHEDULER_QUEUE_KEY          # Redis key under which the scheduler stores requests
       SCHEDULER_QUEUE_CLASS        # FIFO, LIFO or priority; PriorityQueue (sorted set) is the default, others: FifoQueue (list), LifoQueue (list)
       SCHEDULER_DUPEFILTER_KEY     # Redis key under which the dedup records are stored
       DUPEFILTER_CLASS             # either the built-in class or a custom one
                                    # built-in, e.g.: DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'
                                    # custom,   e.g.: DUPEFILTER_CLASS = 'redisdepth.xxx.DupeFilter'  (takes precedence; the source checks it first before proceeding)
       SCHEDULER_SERIALIZER         # serializer for data saved to Redis, pickle by default
     - Reads the redis-server settings:
       # see connection.py, around line 17
       REDIS_HOST = '192.168.1.13'                     # host
       REDIS_PORT = 3306                               # port
       REDIS_PARAMS = {'password': 'woshinidaye'}      # Redis connection parameters
       # Default: REDIS_PARAMS = {'socket_timeout': 30, 'socket_connect_timeout': 30, 'retry_on_timeout': True, 'encoding': REDIS_ENCODING}
       # REDIS_PARAMS['redis_cls'] = 'myproject.RedisClient'  # Python class used to connect to Redis. Default: redis.StrictRedis
       REDIS_ENCODING = "utf-8"                        # Redis encoding. Default: 'utf-8'
       # REDIS_URL = 'redis://user:pass@hostname:9001' # Connection URL (takes precedence over the settings above, as the source shows)
2. The spider starts crawling the start URLs
   - Scheduler.enqueue_request is called
     def enqueue_request(self, request):
         # Should this request be filtered, and has the dedup rule already seen it?
         # request_seen is the key method of the dedup rule
         if not request.dont_filter and self.df.request_seen(request):
             self.df.log(request, self.spider)
             # already visited, do not visit it again
             return False
         if self.stats:
             self.stats.inc_value('scheduler/enqueued/redis', spider=self.spider)
         # not visited yet, push the request into the scheduler queue
         self.queue.push(request)
         return True
3. The downloader fetches tasks from the scheduler and downloads them
   - Scheduler.next_request is called
     def next_request(self):
         block_pop_timeout = self.idle_before_close
         # pop the task from the queue
         request = self.queue.pop(block_pop_timeout)
         if request and self.stats:
             # the request gets downloaded at this point
             self.stats.inc_value('scheduler/dequeued/redis', spider=self.spider)
         return request
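To see these keys in action you can inspect Redis while a crawl is running. A minimal sketch, assuming the settings used in this post (PriorityQueue as the queue class, the spider name 'chouti', and the example Redis host/password from above):

import redis

conn = redis.Redis(host='192.168.1.13', port=3306, password='woshinidaye')

# With the default PriorityQueue the scheduler queue is a sorted set
print(conn.zcard('chouti:requests'))     # pending requests (SCHEDULER_QUEUE_KEY = '%(spider)s:requests')

# The dedup records live in a plain set
print(conn.scard('chouti:dupefilter'))   # seen fingerprints (SCHEDULER_DUPEFILTER_KEY = '%(spider)s:dupefilter')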
Required settings
# Redis dedup settings
REDIS_HOST = '192.168.1.13'                     # host
REDIS_PORT = 3306                               # port
REDIS_PARAMS = {'password': 'woshinidaye'}      # Redis connection parameters
# Default: REDIS_PARAMS = {'socket_timeout': 30, 'socket_connect_timeout': 30, 'retry_on_timeout': True, 'encoding': REDIS_ENCODING}
# REDIS_PARAMS['redis_cls'] = 'myproject.RedisClient'  # Python class used to connect to Redis. Default: redis.StrictRedis
REDIS_ENCODING = "utf-8"                        # Redis encoding. Default: 'utf-8'

# REDIS_URL = 'redis://user:pass@hostname:9001' # Connection URL (takes precedence over the settings above, as the source shows)
DUPEFILTER_KEY = 'dupefilter:%(timestamp)s'
# The stock filter uses the timestamp as part of the key by default
# DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'
# Customized on top of the stock source to change the key stored in Redis
DUPEFILTER_CLASS = 'redisdepth.xxx.RedisDupeFilter'
# Fully custom Redis dedup class
# DUPEFILTER_CLASS = 'redisdepth.xxx.DupeFilter'


# ############# Scheduler settings ###########################
# from scrapy_redis.scheduler import Scheduler

SCHEDULER = "scrapy_redis.scheduler.Scheduler"
DEPTH_PRIORITY = 1    # breadth-first
# DEPTH_PRIORITY = -1 # depth-first
SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.PriorityQueue'  # PriorityQueue (sorted set) is the default; others: FifoQueue (list), LifoQueue (list)
# breadth-first
# SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.FifoQueue'
# depth-first
# SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.LifoQueue'

SCHEDULER_QUEUE_KEY = '%(spider)s:requests'                   # Redis key under which the scheduler stores requests
SCHEDULER_SERIALIZER = "scrapy_redis.picklecompat"            # serializer for data saved to Redis, pickle by default
SCHEDULER_PERSIST = True                                      # keep the scheduler queue and dedup records on close? True = keep, False = flush
SCHEDULER_FLUSH_ON_START = True                               # flush the scheduler queue and dedup records on start? True = flush, False = keep
SCHEDULER_IDLE_BEFORE_CLOSE = 10                              # maximum time to wait when the scheduler queue is empty (if still nothing, no request is returned)
SCHEDULER_DUPEFILTER_KEY = '%(spider)s:dupefilter'            # Redis key under which the dedup records are stored
SCHEDULER_DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'  # class that implements the dedup rule
Summary:
Depth-first: go down to the deepest level first, process it completely, then work back up through the higher levels.
Breadth-first: start at the first level and process each level fully before moving on to the next one.
FIFO → breadth-first (FifoQueue)
LIFO → depth-first (LifoQueue)
Priority queue:
DEPTH_PRIORITY = 1   # breadth-first
DEPTH_PRIORITY = -1  # depth-first
(see the sketch below for why this value flips the crawl order)
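Why DEPTH_PRIORITY flips the crawl order: Scrapy's built-in DepthMiddleware lowers the priority of deeper requests by depth * DEPTH_PRIORITY, and the scrapy_redis PriorityQueue scores requests with -request.priority and pops the lowest score first (as in the priority-queue example above). A minimal sketch of the arithmetic; the helper function and the depth values are purely illustrative:

DEPTH_PRIORITY = 1   # breadth-first

def effective_priority(depth, depth_priority=DEPTH_PRIORITY):
    # What DepthMiddleware does to each request: priority -= depth * DEPTH_PRIORITY
    return 0 - depth * depth_priority

# PriorityQueue stores score = -request.priority and pops the lowest score first
for depth in (1, 2, 3):
    priority = effective_priority(depth)
    score = -priority
    print(depth, priority, score)

# With DEPTH_PRIORITY = 1 deeper requests get higher scores, so shallow requests
# are popped first (breadth-first); with DEPTH_PRIORITY = -1 the order reverses.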
Relationship between the scheduler, the queue, and the DupeFilter:
Scheduler: decides which request to fetch next
Queue: stores the requests
DupeFilter: maintains the record of visited requests
A few extra notes
Persistence: when the spider yields an Item object, RedisPipeline runs and writes the serialized item to Redis.
a. When persisting items to Redis you can specify the key and the serialization function:
   REDIS_ITEMS_KEY = '%(spider)s:items'
   REDIS_ITEMS_SERIALIZER = 'json.dumps'
b. Item data is stored in a Redis list.
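To actually turn this on, enable the scrapy_redis pipeline in settings and then read the items back out of the Redis list. A minimal sketch, assuming the spider name 'chouti' and the example Redis connection values used earlier in this post:

# settings.py
ITEM_PIPELINES = {
    'scrapy_redis.pipelines.RedisPipeline': 300,
}
REDIS_ITEMS_KEY = '%(spider)s:items'
REDIS_ITEMS_SERIALIZER = 'json.dumps'

# Reading the persisted items back out of the list
import json
import redis

conn = redis.Redis(host='192.168.1.13', port=3306, password='woshinidaye')
for raw in conn.lrange('chouti:items', 0, -1):
    print(json.loads(raw))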
A full walkthrough of the settings file
# -*- coding: utf-8 -*-

# Scrapy settings for redisdepth project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

# Crawler name
BOT_NAME = 'redisdepth'

# Spider module paths
SPIDER_MODULES = ['redisdepth.spiders']
NEWSPIDER_MODULE = 'redisdepth.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# Client user-agent request header
#USER_AGENT = 'redisdepth (+http://www.yourdomain.com)'
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'

# robots.txt is the crawlers' "gentleman's agreement"; disable compliance here
# Obey robots.txt rules
# ROBOTSTXT_OBEY = True
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
# Number of concurrent requests (a coarse-grained knob)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# Download delay in seconds
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
# Concurrent requests per domain; the download delay also applies per domain
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
# Concurrent requests per IP; if set, CONCURRENT_REQUESTS_PER_DOMAIN is ignored and the download delay applies per IP
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
# Whether cookies are supported (handled via cookiejar)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
# Telnet is used to inspect and operate on the running crawler
# Use `telnet ip port`, then control the crawler through commands
# TELNETCONSOLE_ENABLED = True
# TELNETCONSOLE_HOST = '127.0.0.1'
# TELNETCONSOLE_PORT = [6023,]
#TELNETCONSOLE_ENABLED = False

# Default request headers
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Spider middlewares
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
#     # 'redisdepth.middlewares.RedisdepthSpiderMiddleware': 543,
#     'redisdepth.sd.Sd1': 666,
#     'redisdepth.sd.Sd2': 667,
# }

# Downloader middlewares
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
#     # 'redisdepth.middlewares.RedisdepthDownloaderMiddleware': 543,
#     # 'redisdepth.md.Md1': 666,
#     # 'redisdepth.md.Md2': 667
# }

# Custom extensions, invoked via signals
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
EXTENSIONS = {
    # 'scrapy.extensions.telnet.TelnetConsole': None,
    'redisdepth.ext.MyExtension': 666,
}

# Item pipelines that process the scraped items
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    'redisdepth.pipelines.RedisdepthPipeline': 300,
#}

"""
AutoThrottle algorithm
from scrapy.contrib.throttle import AutoThrottle
AutoThrottle settings:
1. Get the minimum delay DOWNLOAD_DELAY
2. Get the maximum delay AUTOTHROTTLE_MAX_DELAY
3. Set the initial download delay AUTOTHROTTLE_START_DELAY
4. When a request finishes downloading, take its "connection" time latency,
   i.e. the time between opening the connection and receiving the response headers
5. Values used in the calculation: AUTOTHROTTLE_TARGET_CONCURRENCY
   target_delay = latency / self.target_concurrency
   new_delay = (slot.delay + target_delay) / 2.0   # slot.delay is the previous delay
   new_delay = max(target_delay, new_delay)
   new_delay = min(max(self.mindelay, new_delay), self.maxdelay)
   slot.delay = new_delay
"""
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
# Enable autothrottle
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# Average concurrency per remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
"""
Enable caching
Caches requests/responses that have already been sent so they can be reused later
from scrapy.downloadermiddlewares.httpcache import HttpCacheMiddleware
from scrapy.extensions.httpcache import DummyPolicy
from scrapy.extensions.httpcache import FilesystemCacheStorage
"""
# Whether the cache is enabled
#HTTPCACHE_ENABLED = True
# Cache policy: every request is cached; the next identical request is served from the cache
# HTTPCACHE_POLICY = "scrapy.extensions.httpcache.DummyPolicy"
# Cache policy: cache according to HTTP response headers such as Cache-Control and Last-Modified
# HTTPCACHE_POLICY = "scrapy.extensions.httpcache.RFC2616Policy"
# Cache expiration time
#HTTPCACHE_EXPIRATION_SECS = 0
# Cache directory
#HTTPCACHE_DIR = 'httpcache'
# HTTP status codes not to cache
#HTTPCACHE_IGNORE_HTTP_CODES = []
# Cache storage backend
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'