Request Deduplication Source Code in the Scrapy Framework
Source code location
from scrapy.dupefilter import RFPDupeFilter
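Note: this is the module path in older (pre-1.0) Scrapy releases. From Scrapy 1.0 on, the module name is pluralized, so the equivalent import is:

from scrapy.dupefilters import RFPDupeFilter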
Walking through the deduplication source
from __future__ import print_function
import os
import logging

from scrapy.utils.job import job_dir
from scrapy.utils.request import request_fingerprint


class BaseDupeFilter(object):

    @classmethod
    def from_settings(cls, settings):
        return cls()

    def request_seen(self, request):
        return False

    def open(self):  # can return deferred
        pass

    def close(self, reason):  # can return a deferred
        pass

    def log(self, request, spider):  # log that a request has been filtered
        pass


# Inherits from BaseDupeFilter and overrides its five methods
class RFPDupeFilter(BaseDupeFilter):
    """Request Fingerprint duplicates filter"""

    def __init__(self, path=None, debug=False):
        # debug=True logs every filtered duplicate (see log() below);
        # passing a path persists the fingerprint set to a file
        self.file = None
        self.fingerprints = set()
        self.logdupes = True
        self.debug = debug
        self.logger = logging.getLogger(__name__)
        if path:
            self.file = open(os.path.join(path, 'requests.seen'), 'a+')
            self.file.seek(0)
            self.fingerprints.update(x.rstrip() for x in self.file)

    @classmethod
    def from_settings(cls, settings):
        # job_dir(settings) supplies the storage path configured via JOBDIR
        debug = settings.getbool('DUPEFILTER_DEBUG')
        return cls(job_dir(settings), debug)

    def request_seen(self, request):
        # fp is an MD5-like fingerprint computed from the request URL
        fp = self.request_fingerprint(request)
        # fingerprints is a plain in-memory set
        if fp in self.fingerprints:
            return True  # already seen
        # not seen yet: record the fingerprint in the set
        self.fingerprints.add(fp)
        if self.file:
            self.file.write(fp + os.linesep)

    def request_fingerprint(self, request):
        return request_fingerprint(request)

    def close(self, reason):
        if self.file:
            self.file.close()

    def log(self, request, spider):
        if self.debug:
            msg = "Filtered duplicate request: %(request)s"
            self.logger.debug(msg, {'request': request}, extra={'spider': spider})
        elif self.logdupes:
            msg = ("Filtered duplicate request: %(request)s"
                   " - no more duplicates will be shown"
                   " (see DUPEFILTER_DEBUG to show all duplicates)")
            self.logger.debug(msg, {'request': request}, extra={'spider': spider})
            self.logdupes = False

        spider.crawler.stats.inc_value('dupefilter/filtered', spider=spider)
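To see the fingerprint normalization in action, here is a small standalone check (a sketch assuming the same old-style import paths used above; the example.com URLs are placeholders):

from scrapy.http import Request
from scrapy.utils.request import request_fingerprint

r1 = Request('http://www.example.com/?k1=123&k2=456')
r2 = Request('http://www.example.com/?k2=456&k1=123')

# The query string is canonicalized (parameters sorted) before hashing,
# so both requests share one fingerprint and the second would be filtered.
print(request_fingerprint(r1) == request_fingerprint(r2))  # True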
Custom deduplication rules
a. Write the filter class (dupefilters.py)
from scrapy.utils.request import request_fingerprint
from scrapy.dupefilter import BaseDupeFilter


class WwwDupeFilter(BaseDupeFilter):

    def __init__(self):
        # Initialize visited_fd as a set (it could also live in Redis)
        self.visited_fd = set()

    @classmethod
    def from_settings(cls, settings):
        return cls()

    def request_seen(self, request):
        # request_fingerprint hashes the request URL (MD5-like) after
        # canonicalizing it, so these two produce the same value:
        #   http://www.baidu.com?k1=123&k2=456
        #   http://www.baidu.com?k2=456&k1=123
        fd = request_fingerprint(request=request)
        # Return True if this fingerprint is already in visited_fd
        if fd in self.visited_fd:
            return True
        # Otherwise record it in the set
        self.visited_fd.add(fd)

    def open(self):  # can return deferred
        print('start')

    def close(self, reason):  # can return a deferred
        print('end')

    def log(self, request, spider):  # log that a request has been filtered
        print('log')
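The comment above notes that visited_fd could live in Redis instead of process memory. A minimal sketch of that variant, assuming a local Redis server and the redis-py package (the key name 'dupefilter:fingerprints' is an arbitrary choice for illustration):

import redis
from scrapy.dupefilter import BaseDupeFilter
from scrapy.utils.request import request_fingerprint


class RedisDupeFilter(BaseDupeFilter):

    def __init__(self):
        # One shared Redis SET holds the fingerprints, so deduplication
        # survives restarts and can be shared by several crawler processes
        self.conn = redis.Redis(host='127.0.0.1', port=6379)
        self.key = 'dupefilter:fingerprints'

    @classmethod
    def from_settings(cls, settings):
        return cls()

    def request_seen(self, request):
        fd = request_fingerprint(request=request)
        # SADD returns 0 when the member already existed, 1 when it was added
        return self.conn.sadd(self.key, fd) == 0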
b. Change the default dedup rule in settings.py
# Default dedup filter location (kept for reference)
# DUPEFILTER_CLASS = 'scrapy.dupefilter.RFPDupeFilter'
# Point DUPEFILTER_CLASS at the custom filter instead
DUPEFILTER_CLASS = 'www.dupefilters.WwwDupeFilter'
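To confirm the custom filter is actually active, a spider can read the setting back at runtime; a quick sketch, assuming a Scrapy version where the spider exposes self.settings:

import scrapy


class CheckSpider(scrapy.Spider):
    name = 'check'
    start_urls = ['http://chouti.com/']

    def parse(self, response):
        # Prints the dedup class configured in settings.py
        print(self.settings.get('DUPEFILTER_CLASS'))  # www.dupefilters.WwwDupeFilter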
c. Controlling the dedup rule from the spider
# -*- coding: utf-8 -*-
import io
import sys

import scrapy
from scrapy.http import Request

from www.items import WwwItem

# Re-wrap stdout so Chinese text prints correctly on a GBK console
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')


class ChoutiSpider(scrapy.Spider):
    name = 'chouti'
    allowed_domains = ['chouti.com']
    start_urls = ['http://chouti.com/']

    def parse(self, response):
        # Find the descendant div with id="content-list" and walk its items
        item_list = response.xpath('//div[@id="content-list"]/div[@class="item"]')
        for item in item_list:
            text = item.xpath('.//a/text()').extract_first()
            href = item.xpath('.//a/@href').extract_first()
            yield WwwItem(title=text, href=href)

        page_list = response.xpath('//*[@id="dig_lcpage"]//a/@href').extract()
        for page in page_list:
            page = 'https://dig.chouti.com' + page
            # dont_filter controls the dedup filter: the default False means
            # the request goes through it; True bypasses deduplication
            yield Request(url=page, callback=self.parse, dont_filter=False)
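When a page must be re-fetched even though its URL was already seen (a front page whose content keeps changing, say), pass dont_filter=True. A one-off illustration, reusing the site from start_urls:

yield Request(
    url='https://dig.chouti.com/',   # fetched again even if already seen
    callback=self.parse,
    dont_filter=True,                # bypass the dupe filter for this request
)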
By default Scrapy deduplicates requests with scrapy.dupefilter.RFPDupeFilter; the relevant settings are:
DUPEFILTER_CLASS = 'scrapy.dupefilter.RFPDupeFilter'
DUPEFILTER_DEBUG = False
JOBDIR = "directory for the visit log, e.g. /root/"  # final path: /root/requests.seen
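Because RFPDupeFilter opens requests.seen in append mode and reloads its contents on startup, setting JOBDIR also makes deduplication persistent: resuming a stopped crawl with the same directory, e.g. scrapy crawl chouti -s JOBDIR=/root/, skips every request recorded in the previous run.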