Deduplication source code in the Scrapy framework
Where the source code lives
from scrapy.dupefilters import RFPDupeFilter
Walking through the deduplication source
from __future__ import print_function
import os
import logging

from scrapy.utils.job import job_dir
from scrapy.utils.request import request_fingerprint


class BaseDupeFilter(object):

    @classmethod
    def from_settings(cls, settings):
        return cls()

    def request_seen(self, request):
        return False

    def open(self):  # can return deferred
        pass

    def close(self, reason):  # can return a deferred
        pass

    def log(self, request, spider):  # log that a request has been filtered
        pass


class RFPDupeFilter(BaseDupeFilter):
    """Request Fingerprint duplicates filter"""
    # Inherits from BaseDupeFilter and overrides its five methods

    def __init__(self, path=None, debug=False):
        # With debug=True every filtered duplicate is logged; when a path is
        # given, the fingerprint set is persisted to a file
        self.file = None
        self.fingerprints = set()
        self.logdupes = True
        self.debug = debug
        self.logger = logging.getLogger(__name__)
        if path:
            self.file = open(os.path.join(path, 'requests.seen'), 'a+')
            self.file.seek(0)
            self.fingerprints.update(x.rstrip() for x in self.file)

    @classmethod
    def from_settings(cls, settings):
        # job_dir(settings) resolves the storage path from the JOBDIR setting
        debug = settings.getbool('DUPEFILTER_DEBUG')
        return cls(job_dir(settings), debug)

    def request_seen(self, request):
        # fp is an MD5-like fingerprint computed from the request URL
        fp = self.request_fingerprint(request)
        # fingerprints is a set of already-seen fingerprints
        if fp in self.fingerprints:
            return True  # already visited
        # not visited yet: record the fingerprint in the set
        self.fingerprints.add(fp)
        if self.file:
            self.file.write(fp + os.linesep)

    def request_fingerprint(self, request):
        return request_fingerprint(request)

    def close(self, reason):
        if self.file:
            self.file.close()

    def log(self, request, spider):
        if self.debug:
            msg = "Filtered duplicate request: %(request)s"
            self.logger.debug(msg, {'request': request}, extra={'spider': spider})
        elif self.logdupes:
            msg = ("Filtered duplicate request: %(request)s"
                   " - no more duplicates will be shown"
                   " (see DUPEFILTER_DEBUG to show all duplicates)")
            self.logger.debug(msg, {'request': request}, extra={'spider': spider})
            self.logdupes = False

        spider.crawler.stats.inc_value('dupefilter/filtered', spider=spider)
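The "MD5-like value" mentioned in the comments is the request fingerprint, a SHA1 hex digest of the canonicalized request. A minimal sketch of its behavior (the example URL is a placeholder):

from scrapy.http import Request
from scrapy.utils.request import request_fingerprint

# Canonicalization sorts the query arguments, so reordering them
# does not change the fingerprint
r1 = Request('http://www.example.com/?k1=1&k2=2')
r2 = Request('http://www.example.com/?k2=2&k1=1')
print(request_fingerprint(r1) == request_fingerprint(r2))  # True

This is why two URLs that differ only in query-string order are treated as the same request and the second one is filtered out.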
Custom deduplication rules
a. Write the filter class (dupefilters.py)
from scrapy.utils.request import request_fingerprint
from scrapy.dupefilters import BaseDupeFilter


class WwwDupeFilter(BaseDupeFilter):

    def __init__(self):
        # Initialize visited_fd as a set (it could also be kept in Redis)
        self.visited_fd = set()

    @classmethod
    def from_settings(cls, settings):
        return cls()

    def request_seen(self, request):
        # Compute an MD5-like fingerprint of the request URL.
        # http://www.baidu.com/?k1=123&k2=456
        # http://www.baidu.com/?k2=456&k1=123
        # The two URLs above yield the same fingerprint, because
        # request_fingerprint canonicalizes the query-string order
        fd = request_fingerprint(request=request)
        # Return True if this fingerprint has already been seen
        if fd in self.visited_fd:
            return True
        # Otherwise record it in the set
        self.visited_fd.add(fd)

    def open(self):  # can return deferred
        print('start')

    def close(self, reason):  # can return a deferred
        print('end')

    def log(self, request, spider):  # log that a request has been filtered
        print('log')
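Since the comment above notes that visited_fd could also live in Redis, here is a minimal sketch of that variant. The connection parameters and the key name are assumptions, not part of the original project:

import redis  # assumes the redis-py package is installed

from scrapy.dupefilters import BaseDupeFilter
from scrapy.utils.request import request_fingerprint


class RedisDupeFilter(BaseDupeFilter):
    """Hypothetical Redis-backed filter; host, port and key are placeholders."""

    def __init__(self):
        self.conn = redis.Redis(host='localhost', port=6379)  # assumed local Redis
        self.key = 'dupefilter:fingerprints'                  # hypothetical key name

    def request_seen(self, request):
        fd = request_fingerprint(request=request)
        # SADD returns 1 if the member was newly added, 0 if it already
        # existed, so one round trip both checks and records the fingerprint
        return self.conn.sadd(self.key, fd) == 0

Shared storage like this is what would let several spider processes deduplicate against the same fingerprint set.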
b. Point settings.py at the custom filter
# Default dedupe filter shipped with Scrapy
# DUPEFILTER_CLASS = 'scrapy.dupefilters.RFPDupeFilter'
# Point Scrapy at the custom dedupe filter instead
DUPEFILTER_CLASS = 'www.dupefilters.WwwDupeFilter'
c. Controlling deduplication from the spider
# -*- coding: utf-8 -*-
import io
import sys

import scrapy
from scrapy.http import Request
from www.items import WwwItem

# Re-wrap stdout so Chinese text prints correctly on a GBK console
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')


class ChoutiSpider(scrapy.Spider):
    name = 'chouti'
    allowed_domains = ['chouti.com']
    start_urls = ['http://chouti.com/']

    def parse(self, response):
        # Among the descendants, find the div with id="content-list"
        item_list = response.xpath('//div[@id="content-list"]/div[@class="item"]')
        for item in item_list:
            text = item.xpath('.//a/text()').extract_first()
            href = item.xpath('.//a/@href').extract_first()
            yield WwwItem(title=text, href=href)

        page_list = response.xpath('//*[@id="dig_lcpage"]//a/@href').extract()
        for page in page_list:
            page = 'https://dig.chouti.com' + page
            # dont_filter controls the dedupe rule: False (the default)
            # obeys it, True bypasses it
            yield Request(url=page, callback=self.parse, dont_filter=False)
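As the comment notes, dont_filter=True makes the scheduler skip request_seen() entirely. A minimal sketch of issuing requests that bypass the filter, written as a start_requests override for the spider above (the override itself is illustrative, not part of the original spider):

    def start_requests(self):
        for url in self.start_urls:
            # dont_filter=True: this request reaches the downloader even if
            # its fingerprint is already in the dupe filter's set
            yield Request(url=url, callback=self.parse, dont_filter=True)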
By default, Scrapy deduplicates with scrapy.dupefilters.RFPDupeFilter; the relevant settings are:
DUPEFILTER_CLASS = 'scrapy.dupefilters.RFPDupeFilter'
DUPEFILTER_DEBUG = False
JOBDIR = "/root/"  # directory for the visited-requests log; the final file is /root/requests.seen
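JOBDIR can also be passed on the command line, which is the standard way to persist the fingerprint file between runs (the spider name and directory below are examples):

scrapy crawl chouti -s JOBDIR=crawls/chouti-1

Stopping the crawl and re-running the same command resumes it, loading the previously seen fingerprints back from requests.seen.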