# 安裝 redis 服務器端
sudo apt-get install redis-server

# 安裝 scrapy 和 scrapy-redis 庫
pip install scrapy
pip install scrapy-redis
# 終端1
$ redis-cli

# 終端2(在spider目錄下打開,並切換到虛擬環境)
$ scrapy runspider bludv.py

# 終端1(在 redis-cli 內執行,推入起始 URL)
$ lpush bludv:start_urls https://www.bludv.tv
# 終端1:
# 刪除當前數據庫中的所有 Key
flushdb
# 刪除所有數據庫中的 key
flushall
# 下面的命令指定數據庫序號為 0,即默認數據庫
redis-cli -n 0 keys "*" | xargs redis-cli -n 0 del
settings.py代碼需要更改的:
# settings.py — changes required for scrapy-redis distributed crawling.

# Use the scrapy_redis scheduler: requests are queued in and dispatched
# from the shared Redis database instead of in-process memory.
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Deduplicate request fingerprints through Redis (enables cluster-wide dedup).
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"

# Request queue implementation — exactly one assignment should be active.
# (The original file assigned SpiderPriorityQueue and then immediately
# overrode it with SpiderQueue; only the FIFO queue was ever in effect.)
# Priority queue (scrapy-redis default):
# SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderPriorityQueue"
# FIFO queue — the value actually in effect:
SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.SpiderQueue'
# LIFO stack (last-in, first-out):
# SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.SpiderStack'

# Keep the Redis queues between runs so a crawl can be paused and resumed.
# SCHEDULER_PERSIST = True

# REDIS_URL = 'redis://192.168.2.223:6379'  # remote (Tencent) server
# Host and port to use when connecting to Redis (optional).
REDIS_HOST = 'localhost'
REDIS_PORT = 6379

# COOKIES_ENABLED = False   # disable cookies
# DOWNLOAD_DELAY = 4        # download delay (seconds)
# DOWNLOAD_TIMEOUT = 10     # request timeout (seconds)

# Retry failed requests.
RETRY_ENABLED = True
# Number of retry attempts.
RETRY_TIMES = 8
# Auto-throttle requests to avoid missing pages under server load.
AUTOTHROTTLE_ENABLED = True

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'BludvSpider.pipelines.BludvspiderPipeline': 300,
    'BludvSpider.pipelines.MongodbWithPymongoPipeline': 301,  # write to MongoDB via PyMongo
    'scrapy_redis.pipelines.RedisPipeline': 100,  # scrapy-redis: store items in Redis
}
bludv.py需要更改的:
# bludv.py — change the spider's base class so it can run distributed:
# RedisSpider pulls its start requests from a Redis list (named by the
# spider's redis_key attribute) instead of a local start_urls list.
from scrapy_redis.spiders import RedisSpider


# Before: class BludvSpider(scrapy.Spider):
class BludvSpider(RedisSpider):
    # Body unchanged from the original spider; only the base class differs.
    pass
# Previously the spider hard-coded its entry points:
# start_urls = [
#     # 'https://www.bludv.tv/category/series/'
#     # 'https://www.bludv.tv/category/series/page/56/'
#     # 'https://www.bludv.tv/o-protetor-2-torrent-2018-dublado-dual-audio-legendado-bluray-720p-e-1080p-download/'
#     "https://www.bludv.tv"
# ]
# Changed to: RedisSpider reads start URLs from this Redis list key.
# Seed it from redis-cli with:  lpush bludv:start_urls https://www.bludv.tv
redis_key = "bludv:start_urls"