Crawler Basics 6 (Deduplication Source Code in the Scrapy Framework and Custom Deduplication)


Deduplication source code in the Scrapy framework

Where the source lives

from scrapy.dupefilter import RFPDupeFilter  # in Scrapy >= 1.0 the module is scrapy.dupefilters

Walking through the dedup source

from __future__ import print_function
import os
import logging

from scrapy.utils.job import job_dir
from scrapy.utils.request import request_fingerprint


class BaseDupeFilter(object):

    @classmethod
    def from_settings(cls, settings):
        return cls()

    def request_seen(self, request):
        return False

    def open(self):  # can return deferred
        pass

    def close(self, reason):  # can return a deferred
        pass

    def log(self, request, spider):  # log that a request has been filtered
        pass


# Inherits from BaseDupeFilter and implements its five methods
class RFPDupeFilter(BaseDupeFilter):
    """Request Fingerprint duplicates filter"""

    def __init__(self, path=None, debug=False):
        # debug=True makes every duplicate request get logged; fingerprints are only
        # written to a file when a path (JOBDIR) is configured
        self.file = None
        self.fingerprints = set()
        self.logdupes = True
        self.debug = debug
        self.logger = logging.getLogger(__name__)
        if path:
            self.file = open(os.path.join(path, 'requests.seen'), 'a+')
            self.file.seek(0)
            self.fingerprints.update(x.rstrip() for x in self.file)

    @classmethod
    def from_settings(cls, settings):
        # read DUPEFILTER_DEBUG and the JOBDIR storage path from the settings
        debug = settings.getbool('DUPEFILTER_DEBUG')
        return cls(job_dir(settings), debug)

    def request_seen(self, request):
        # compute the request fingerprint: a SHA1 hash built from the request
        # method, the canonicalized URL and the body (similar in spirit to an MD5 of the URL)
        fp = self.request_fingerprint(request)
        # self.fingerprints is the in-memory set of fingerprints seen so far
        if fp in self.fingerprints:
            return True  # already seen
        # not seen yet
        self.fingerprints.add(fp)  # record the fingerprint
        if self.file:
            self.file.write(fp + os.linesep)

    def request_fingerprint(self, request):
        return request_fingerprint(request)

    def close(self, reason):
        if self.file:
            self.file.close()

    def log(self, request, spider):
        if self.debug:
            msg = "Filtered duplicate request: %(request)s"
            self.logger.debug(msg, {'request': request}, extra={'spider': spider})
        elif self.logdupes:
            msg = ("Filtered duplicate request: %(request)s"
                   " - no more duplicates will be shown"
                   " (see DUPEFILTER_DEBUG to show all duplicates)")
            self.logger.debug(msg, {'request': request}, extra={'spider': spider})
            self.logdupes = False

        spider.crawler.stats.inc_value('dupefilter/filtered', spider=spider)
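
The heart of the filter is request_fingerprint: two requests that differ only in query-parameter order hash to the same value, because the URL is canonicalized before hashing. A minimal check of that behavior (a sketch, assuming a Scrapy version where scrapy.utils.request.request_fingerprint is still available, as in the code above; the example.com URL is a placeholder):

from scrapy.http import Request
from scrapy.utils.request import request_fingerprint

r1 = Request('https://example.com/page?a=1&b=2')
r2 = Request('https://example.com/page?b=2&a=1')

# the URL is canonicalized (query arguments sorted) before the SHA1 hash is taken,
# so both requests map to the same fingerprint
print(request_fingerprint(r1) == request_fingerprint(r2))  # True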

Custom dedup rules

a. Write the filter class (dupefilters.py)

from scrapy.utils.request import request_fingerprint
from scrapy.dupefilter import BaseDupeFilter  # scrapy.dupefilters in newer Scrapy versions


class WwwDupeFilter(BaseDupeFilter):
    def __init__(self):
        # initialize visited_fd as an in-memory set (it could also be kept in Redis)
        self.visited_fd = set()

    @classmethod
    def from_settings(cls, settings):
        return cls()

    def request_seen(self, request):
        # hash the incoming request, much like taking an MD5 of the URL
        # http://www.baidu.com?a=1&b=2
        # http://www.baidu.com?b=2&a=1   these two yield the same fingerprint,
        # because request_fingerprint canonicalizes the URL (query parameters are sorted)
        fd = request_fingerprint(request=request)
        # return True if this fingerprint has been seen before
        if fd in self.visited_fd:
            return True
        # otherwise record it in the set
        self.visited_fd.add(fd)

    def open(self):  # can return deferred
        print('opened')

    def close(self, reason):  # can return a deferred
        print('closed')

    def log(self, request, spider):  # log that a request has been filtered
        print('log')
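
The __init__ comment above notes that the fingerprint set could also be kept in Redis, so it survives restarts and can be shared by several crawler processes. A minimal sketch of that idea (assuming a Redis server on localhost and the redis-py package; the class name RedisDupeFilter and the key dupefilter:fingerprints are illustrative, not part of Scrapy):

import redis
from scrapy.dupefilter import BaseDupeFilter  # scrapy.dupefilters in newer versions
from scrapy.utils.request import request_fingerprint


class RedisDupeFilter(BaseDupeFilter):
    """Keeps request fingerprints in a Redis set instead of an in-memory set."""

    def __init__(self, key='dupefilter:fingerprints'):
        self.key = key
        self.conn = redis.StrictRedis(host='127.0.0.1', port=6379)

    @classmethod
    def from_settings(cls, settings):
        return cls()

    def request_seen(self, request):
        fp = request_fingerprint(request)
        # SADD returns 1 if the fingerprint was newly added, 0 if it was already there
        return self.conn.sadd(self.key, fp) == 0

Enabling it works exactly like step b below: point DUPEFILTER_CLASS at this class.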

b. Switch the default dedup filter in settings.py

# location of the default dedup filter
# DUPEFILTER_CLASS = 'scrapy.dupefilter.RFPDupeFilter'
# point it at the custom dedup filter instead
DUPEFILTER_CLASS = 'www.dupefilters.WwwDupeFilter'

c. Controlling the dedup filter from the spider

# -*- coding: utf-8 -*-
import scrapy
import sys, os, io
from scrapy.http import Request
from scrapy.dupefilter import RFPDupeFilter
from scrapy.http.response.html import HtmlResponse

from www.items import WwwItem

sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')  # so Chinese output displays correctly on a GBK/GB18030 console


class ChoutiSpider(scrapy.Spider):
    name = 'chouti'
    allowed_domains = ['chouti.com']
    start_urls = ['http://chouti.com/']

    def parse(self, response):
        # search the descendants for the div with id="content-list" and iterate over its item divs
        item_list = response.xpath('//div[@id="content-list"]/div[@class="item"]')
        for item in item_list:
            text = item.xpath('.//a/text()').extract_first()
            href = item.xpath('.//a/@href').extract_first()
            yield WwwItem(title=text, href=href)
        page_list = response.xpath('//*[@id="dig_lcpage"]//a/@href').extract()
        for page in page_list:
            page = 'https://dig.chouti.com' + page
            yield Request(url=page, callback=self.parse, dont_filter=False)
            # dont_filter controls the dedup filter: False (the default) lets duplicate
            # requests be filtered out, True bypasses the filter

Scrapy deduplicates with scrapy.dupefilter.RFPDupeFilter by default (scrapy.dupefilters in newer versions); the relevant settings are:

DUPEFILTER_CLASS = 'scrapy.dupefilter.RFPDupeFilter'
DUPEFILTER_DEBUG = False
JOBDIR = "/root/"  # directory where the seen-request record is persisted; the final path is /root/requests.seen
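
A quick way to see the persistence in action (a sketch, using the chouti spider from step c and an arbitrary job directory name):

scrapy crawl chouti -s JOBDIR=crawls/chouti-1
# first run: crawls/chouti-1/requests.seen is created and filled with fingerprints
scrapy crawl chouti -s JOBDIR=crawls/chouti-1
# second run: the saved fingerprints are loaded on startup, so requests already
# seen in the first run are filtered out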

 

