Scrapy-redis: RFPDupeFilter, Queue, and Scheduler


Applying scrapy-redis deduplication

# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request


class ChoutiSpider(scrapy.Spider):
    name = 'chouti'
    allowed_domains = ['chouti.com']
    start_urls = ['http://www.chouti.com/']

    def start_requests(self):
        url = "http://dig.chouti.com/"
        yield Request(url=url, callback=self.parse)

    def parse(self, response):
        print('response', response)

A custom dupe filter that drops duplicate URLs and stores the visited fingerprints in Redis:

#!/usr/bin/env python
# -*- coding:utf-8 -*-

import time
from scrapy.dupefilters import BaseDupeFilter
from scrapy.utils.request import request_fingerprint
import redis
from scrapy_redis.dupefilter import RFPDupeFilter
from scrapy_redis.connection import get_redis_from_settings
from scrapy_redis import defaults


class DupeFilter(BaseDupeFilter):
    def __init__(self):
        self.conn = redis.Redis(host='192.168.1.13', port=3306, password='woshinidaye')

    def request_seen(self, request):
        # Compute the request fingerprint and try to add it to a Redis set;
        # sadd returns 1 only when the member was not already present.
        fd = request_fingerprint(request)
        result = self.conn.sadd('visited_urls', fd)
        if result == 1:
            return False  # not seen before, let it through
        return True       # already seen, filter it out


class RedisDupeFilter(RFPDupeFilter):
    """
    Override the key stored in Redis; the upstream source uses a timestamp
    as part of the key by default.
    """

    @classmethod
    def from_settings(cls, settings):
        """Returns an instance from given settings.

        This uses by default the key ``dupefilter:<timestamp>``. When using the
        ``scrapy_redis.scheduler.Scheduler`` class, this method is not used as
        it needs to pass the spider name in the key.

        Parameters
        ----------
        settings : scrapy.settings.Settings

        Returns
        -------
        RFPDupeFilter
            A RFPDupeFilter instance.
        """
        server = get_redis_from_settings(settings)
        # XXX: This creates one-time key. needed to support to use this
        # class as standalone dupefilter with scrapy's default scheduler
        # if scrapy passes spider on open() method this wouldn't be needed
        # TODO: Use SCRAPY_JOB env as default and fallback to timestamp.
        # key = defaults.DUPEFILTER_KEY % {'timestamp': int(time.time())}
        # The timestamp changes on every run and is hard to look up later,
        # so it is pinned to a fixed value instead:
        key = defaults.DUPEFILTER_KEY % {'timestamp': 'woshinidie'}
        debug = settings.getbool('DUPEFILTER_DEBUG')
        return cls(server, key=key, debug=debug)
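For context on what gets stored: request_fingerprint hashes the request method, the canonicalized URL, and the body, so two URLs that differ only in query-parameter order map to the same fingerprint. A quick sketch (not from the original post):

from scrapy.http import Request
from scrapy.utils.request import request_fingerprint

r1 = Request('http://dig.chouti.com/?a=1&b=2')
r2 = Request('http://dig.chouti.com/?b=2&a=1')
# The URL is canonicalized (query args sorted), so both fingerprints match.
print(request_fingerprint(r1))                              # a 40-char sha1 hex digest
print(request_fingerprint(r1) == request_fingerprint(r2))   # True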

Settings

# Redis dedupe settings
REDIS_HOST = '192.168.1.13'                           # hostname
REDIS_PORT = 3306                                     # port
REDIS_PARAMS = {'password': 'woshinidaye'}            # Redis connection params; default: {'socket_timeout': 30, 'socket_connect_timeout': 30, 'retry_on_timeout': True, 'encoding': REDIS_ENCODING}
# REDIS_PARAMS['redis_cls'] = 'myproject.RedisClient' # class used to connect to Redis; default: redis.StrictRedis
REDIS_ENCODING = "utf-8"                              # Redis encoding; default: 'utf-8'

# REDIS_URL = 'redis://user:pass@hostname:9001'       # connection URL (takes precedence over the settings above; see the source)
DUPEFILTER_KEY = 'dupefilter:%(timestamp)s'
# The stock class uses a timestamp in the key by default:
# DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'
# My subclass that overrides the key stored in Redis:
DUPEFILTER_CLASS = 'redisdepth.xxx.RedisDupeFilter'
# Fully custom dedupe filter:
# DUPEFILTER_CLASS = 'redisdepth.xxx.DupeFilter'
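To check that the filter is really writing to Redis, the keys can be inspected with redis-py after a crawl; a minimal sketch, assuming the host, password, and key names used above:

import redis

conn = redis.Redis(host='192.168.1.13', port=3306, password='woshinidaye')
# The custom DupeFilter stores fingerprints in the 'visited_urls' set;
# the RedisDupeFilter subclass stores them under 'dupefilter:woshinidie'.
print(conn.scard('visited_urls'))           # how many fingerprints the custom filter recorded
print(conn.scard('dupefilter:woshinidie'))  # same count for the subclassed filter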

Scrapy-redis queues

  Includes a FIFO queue, a LIFO queue, and a priority queue.

1. FIFO queue

#!/usr/bin/env python
# -*- coding:utf-8 -*-

import redis


class FifoQueue(object):
    def __init__(self):
        self.server = redis.Redis(host='192.168.1.13', port=3306, password='woshinidaye')

    def push(self, request):
        """Push a request"""
        self.server.lpush('User', request)

    def pop(self):
        """Pop a request"""
        data = self.server.rpop('User')
        return data


q = FifoQueue()
q.push(11)
q.push(22)
q.push(33)
print(q.pop())  # b'11' -- push on the left, pop on the right: first in, first out

2. LIFO queue

#!/usr/bin/env python
# -*- coding:utf-8 -*-

import redis


class LifoQueue(object):

    def __init__(self):
        self.server = redis.Redis(host='192.168.1.13', port=3306, password='woshinidaye')

    def push(self, request):
        """Push a request"""
        self.server.lpush('User', request)

    def pop(self, timeout=0):
        """Pop a request"""
        data = self.server.lpop('User')
        return data
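A usage sketch mirroring the FIFO demo above (added for symmetry, not in the original):

q = LifoQueue()
q.push(11)
q.push(22)
q.push(33)
print(q.pop())  # b'33' -- push and pop both happen on the left: last in, first out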

3. Priority queue

#!/usr/bin/env python
# -*- coding:utf-8 -*-

import redis


class PriorityQueue(object):
    """Per-spider priority queue abstraction using redis' sorted set"""

    def __init__(self):
        self.server = redis.Redis(host='192.168.1.13', port=3306, password='woshinidaye')

    def push(self, request, score):
        """Push a request"""
        # In scrapy_redis the score is derived from the request (score = -request.priority);
        # here it is passed in explicitly so the demo below works with plain strings.
        # We don't use zadd method as the order of arguments change depending on
        # whether the class is Redis or StrictRedis, and the option of using
        # kwargs only accepts strings, not bytes.
        self.server.execute_command('ZADD', 'xxxx', score, request)

    def pop(self, timeout=0):
        """
        Pop a request
        timeout not support in this queue class
        """
        # use atomic range/remove using multi/exec
        pipe = self.server.pipeline()
        pipe.multi()
        pipe.zrange('xxxx', 0, 0).zremrangebyrank('xxxx', 0, 0)
        results, count = pipe.execute()
        if results:
            return results[0]


q = PriorityQueue()

q.push('ZH', -99)
q.push('SB', -66)
q.push('JJ', -33)
# Popping from the lowest score to the highest gives breadth-first order;
# popping from the highest to the lowest would give depth-first.
print(q.pop())  # pops the lowest score first, so b'ZH'
print(q.pop())
print(q.pop())

Scheduler source analysis (written in Notepad++ and pasted here)

1. Start from: from scrapy_redis.scheduler import Scheduler
    - Scheduler.from_crawler is executed
    - Scheduler.from_settings is executed
        - Reads settings:
            SCHEDULER_PERSIST               # whether to keep the scheduler queue and dedupe records on close; True = keep, False = flush
            SCHEDULER_FLUSH_ON_START        # whether to flush the scheduler queue and dedupe records on start; True = flush, False = keep
            SCHEDULER_IDLE_BEFORE_CLOSE     # when fetching from the scheduler and it is empty, the maximum time to wait (in case nothing ever arrives)
        - Reads settings:
            SCHEDULER_QUEUE_KEY             # Redis key under which the scheduler stores requests
            SCHEDULER_QUEUE_CLASS           # choose one of three queues: PriorityQueue (sorted set, the default), FifoQueue (list), LifoQueue (list)
            SCHEDULER_DUPEFILTER_KEY        # Redis key under which the dedupe records are stored
            DUPEFILTER_CLASS                # either the built-in filter or a custom one
                # built-in, e.g.: DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'
                # custom, e.g.:   DUPEFILTER_CLASS = 'redisdepth.xxx.DupeFilter'    (this takes precedence; the source checks it first before falling back)
            SCHEDULER_SERIALIZER            # serializer for data stored in Redis; pickle by default
        - Reads the redis-server settings:
            # see connection.py, around line 17
            REDIS_HOST = '192.168.1.13'                           # hostname
            REDIS_PORT = 3306                                     # port
            REDIS_PARAMS = {'password': 'woshinidaye'}            # Redis connection params; default: {'socket_timeout': 30, 'socket_connect_timeout': 30, 'retry_on_timeout': True, 'encoding': REDIS_ENCODING}
            # REDIS_PARAMS['redis_cls'] = 'myproject.RedisClient' # class used to connect to Redis; default: redis.StrictRedis
            REDIS_ENCODING = "utf-8"                              # Redis encoding; default: 'utf-8'
            # REDIS_URL = 'redis://user:pass@hostname:9001'       # connection URL (takes precedence over the settings above; see the source)
2. The crawler starts executing the start URLs
        - Scheduler.enqueue_request is called
        def enqueue_request(self, request):
            # Should this request be filtered, and has the dedupe rule already seen it?
            # request_seen both checks and records the visit; it is the key method of the dedupe rule.
            if not request.dont_filter and self.df.request_seen(request):
                self.df.log(request, self.spider)
                # already visited, do not visit again
                return False
            if self.stats:
                self.stats.inc_value('scheduler/enqueued/redis', spider=self.spider)
            # not visited yet, push the request into the scheduler queue
            self.queue.push(request)
            return True
3. The downloader fetches a task from the scheduler and executes the download
        - Scheduler.next_request is called
        def next_request(self):
            block_pop_timeout = self.idle_before_close
            # pop the task off the queue
            request = self.queue.pop(block_pop_timeout)
            if request and self.stats:
                # the download happens at this point
                self.stats.inc_value('scheduler/dequeued/redis', spider=self.spider)
            return request
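One practical consequence of enqueue_request above: a request built with dont_filter=True skips the request_seen check and is always queued. A minimal illustrative spider (hypothetical, not from the original):

import scrapy
from scrapy.http import Request


class DemoSpider(scrapy.Spider):
    name = 'demo'
    start_urls = ['http://dig.chouti.com/']

    def parse(self, response):
        # Deduplicated: dropped if its fingerprint is already recorded by the dupefilter.
        yield Request('http://dig.chouti.com/', callback=self.parse)
        # Never deduplicated: dont_filter=True bypasses request_seen and is always enqueued.
        yield Request('http://dig.chouti.com/', callback=self.parse, dont_filter=True)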

Required settings

# Redis dedupe settings
REDIS_HOST = '192.168.1.13'                           # hostname
REDIS_PORT = 3306                                     # port
REDIS_PARAMS = {'password': 'woshinidaye'}            # Redis connection params; default: {'socket_timeout': 30, 'socket_connect_timeout': 30, 'retry_on_timeout': True, 'encoding': REDIS_ENCODING}
# REDIS_PARAMS['redis_cls'] = 'myproject.RedisClient' # class used to connect to Redis; default: redis.StrictRedis
REDIS_ENCODING = "utf-8"                              # Redis encoding; default: 'utf-8'

# REDIS_URL = 'redis://user:pass@hostname:9001'       # connection URL (takes precedence over the settings above; see the source)
DUPEFILTER_KEY = 'dupefilter:%(timestamp)s'
# The stock class uses a timestamp in the key by default:
# DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'
# My subclass that overrides the key stored in Redis:
DUPEFILTER_CLASS = 'redisdepth.xxx.RedisDupeFilter'
# Fully custom dedupe filter:
# DUPEFILTER_CLASS = 'redisdepth.xxx.DupeFilter'


# ############# Scheduler settings ###########################
# from scrapy_redis.scheduler import Scheduler

SCHEDULER = "scrapy_redis.scheduler.Scheduler"
DEPTH_PRIORITY = 1  # breadth-first
# DEPTH_PRIORITY = -1 # depth-first
SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.PriorityQueue'  # PriorityQueue (sorted set) is the default; others: FifoQueue (list), LifoQueue (list)
# breadth-first
# SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.FifoQueue'
# depth-first
# SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.LifoQueue'

SCHEDULER_QUEUE_KEY = '%(spider)s:requests'         # Redis key under which the scheduler stores requests
SCHEDULER_SERIALIZER = "scrapy_redis.picklecompat"  # serializer for data stored in Redis; pickle by default
SCHEDULER_PERSIST = True                            # whether to keep the scheduler queue and dedupe records on close; True = keep, False = flush
SCHEDULER_FLUSH_ON_START = True                     # whether to flush the scheduler queue and dedupe records on start; True = flush, False = keep
SCHEDULER_IDLE_BEFORE_CLOSE = 10                    # when fetching from the scheduler and it is empty, the maximum time to wait (in case nothing ever arrives)
SCHEDULER_DUPEFILTER_KEY = '%(spider)s:dupefilter'  # Redis key under which the dedupe records are stored
SCHEDULER_DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'    # class that implements the dedupe rule
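A rough illustration of what SCHEDULER_SERIALIZER implies: scrapy_redis's picklecompat module is a thin wrapper around pickle, and the queue serializes each request before pushing it into Redis and deserializes it on pop. A sketch using a stand-in dict (the real queue encodes the full Request object):

from scrapy_redis import picklecompat

payload = {'url': 'http://dig.chouti.com/', 'priority': 0}  # stand-in for an encoded request
blob = picklecompat.dumps(payload)           # bytes, ready to be pushed into a list or sorted set
print(picklecompat.loads(blob) == payload)   # True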

Summary:

Depth-first: descend level by level to the deepest layer and finish everything there before coming back up.
Breadth-first: start at the first level and finish each level before moving on to the next.

First in, first out = breadth-first: FifoQueue
Last in, first out = depth-first: LifoQueue
Priority queue:
DEPTH_PRIORITY = 1   # breadth-first
DEPTH_PRIORITY = -1  # depth-first
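Why DEPTH_PRIORITY flips the crawl order when the priority queue is used: Scrapy's DepthMiddleware subtracts depth * DEPTH_PRIORITY from each request's priority, and scrapy_redis's PriorityQueue stores requests with score = -priority and always pops the lowest score. A rough sketch of the arithmetic (illustrative only, not the actual middleware code):

DEPTH_PRIORITY = 1  # set to -1 to see the order flip

def score_for(depth, base_priority=0):
    # DepthMiddleware: request.priority -= depth * DEPTH_PRIORITY
    priority = base_priority - depth * DEPTH_PRIORITY
    # scrapy_redis PriorityQueue: score = -request.priority; the lowest score pops first
    return -priority

# With DEPTH_PRIORITY = 1, shallow requests get the lower score and pop first -> breadth-first.
# With DEPTH_PRIORITY = -1, deeper requests get the lower score and pop first -> depth-first.
print([score_for(d) for d in range(4)])  # [0, 1, 2, 3]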

 

 

How the Scheduler, Queue, and DupeFilter fit together
  Scheduler: decides which request to hand out next
  Queue: stores the pending requests
  DupeFilter: keeps the record of which requests have already been seen

 

A few extra notes

Item persistence: when the spider yields an Item object, RedisPipeline pushes it into Redis (items are serialized with a JSON encoder by default; see below to override).

    a. When persisting items to Redis, the key and the serialization function can be set:

        REDIS_ITEMS_KEY = '%(spider)s:items'
        REDIS_ITEMS_SERIALIZER = 'json.dumps'

    b. Item data is stored in a Redis list.
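A minimal settings sketch for enabling this (the pipeline path is the one shipped with scrapy-redis; the priority value 300 is just an example):

# Push every yielded item into the Redis list '<spider>:items'.
ITEM_PIPELINES = {
    'scrapy_redis.pipelines.RedisPipeline': 300,
}
REDIS_ITEMS_KEY = '%(spider)s:items'
REDIS_ITEMS_SERIALIZER = 'json.dumps'   # optional; a JSON encoder is used if this is unset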

Walking through the settings file

# -*- coding: utf-8 -*-

# Scrapy settings for redisdepth project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

# Bot name
BOT_NAME = 'redisdepth'

# Spider module paths
SPIDER_MODULES = ['redisdepth.spiders']
NEWSPIDER_MODULE = 'redisdepth.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
# Client User-Agent header
#USER_AGENT = 'redisdepth (+http://www.yourdomain.com)'
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'

# The robots.txt "gentlemen's agreement" for crawlers; set False to ignore it
# Obey robots.txt rules
# ROBOTSTXT_OBEY = True
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
# Maximum concurrent requests (a coarse, global knob)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# Download delay in seconds
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
# Concurrent requests per domain; the download delay above is also applied per domain
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
# Concurrent requests per IP; if set, CONCURRENT_REQUESTS_PER_DOMAIN is ignored and the download delay is applied per IP
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
# Whether cookies are enabled; cookies are handled through a cookiejar
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
# The Telnet console can be used to inspect and control the running crawler
#    connect with: telnet <ip> <port>, then issue commands
# TELNETCONSOLE_ENABLED = True
# TELNETCONSOLE_HOST = '127.0.0.1'
# TELNETCONSOLE_PORT = [6023,]
#TELNETCONSOLE_ENABLED = False

# Default request headers
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Spider middlewares
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
#    # 'redisdepth.middlewares.RedisdepthSpiderMiddleware': 543,
#     'redisdepth.sd.Sd1': 666,
#     'redisdepth.sd.Sd2': 667,
#
# }

# Downloader middlewares
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
#    # 'redisdepth.middlewares.RedisdepthDownloaderMiddleware': 543,
#    #  'redisdepth.md.Md1': 666,
#    #  'redisdepth.md.Md2': 667
# }

# Custom extensions, invoked via signals
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
EXTENSIONS = {
   # 'scrapy.extensions.telnet.TelnetConsole': None,
    'redisdepth.ext.MyExtension': 666,
}

# Item pipelines (they process yielded items, not requests)
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    'redisdepth.pipelines.RedisdepthPipeline': 300,
#}

"""
 自動限速算法
    from scrapy.contrib.throttle import AutoThrottle
    自動限速設置
    1. 獲取最小延遲 DOWNLOAD_DELAY
    2. 獲取最大延遲 AUTOTHROTTLE_MAX_DELAY
    3. 設置初始下載延遲 AUTOTHROTTLE_START_DELAY
    4. 當請求下載完成后,獲取其"連接"時間 latency,即:請求連接到接受到響應頭之間的時間
    5. 用於計算的... AUTOTHROTTLE_TARGET_CONCURRENCY
    target_delay = latency / self.target_concurrency
    new_delay = (slot.delay + target_delay) / 2.0 # 表示上一次的延遲時間
    new_delay = max(target_delay, new_delay)
    new_delay = min(max(self.mindelay, new_delay), self.maxdelay)
    slot.delay = new_delay
"""
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
# Enable AutoThrottle
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
# Initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# Maximum download delay
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# Target average number of requests sent in parallel to each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# Whether to show throttling stats for every response
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
"""
啟用緩存
    目的用於將已經發送的請求或相應緩存下來,以便以后使用
    
    from scrapy.downloadermiddlewares.httpcache import HttpCacheMiddleware
    from scrapy.extensions.httpcache import DummyPolicy
    from scrapy.extensions.httpcache import FilesystemCacheStorage
"""
# Whether to enable HTTP caching
#HTTPCACHE_ENABLED = True
# Cache policy: cache every request; subsequent requests are served straight from the cache
# HTTPCACHE_POLICY = "scrapy.extensions.httpcache.DummyPolicy"
# Cache policy: cache according to HTTP response headers such as Cache-Control and Last-Modified
# HTTPCACHE_POLICY = "scrapy.extensions.httpcache.RFC2616Policy"
# Cache expiration time in seconds
#HTTPCACHE_EXPIRATION_SECS = 0
# Cache storage directory
#HTTPCACHE_DIR = 'httpcache'
# HTTP status codes not to cache
#HTTPCACHE_IGNORE_HTTP_CODES = []
# Cache storage backend
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

 

