Crawler Basics 6 (Scrapy's De-duplication Source Code and Custom De-duplication)


De-duplication source code in the Scrapy framework

Where the source lives

from scrapy.dupefilter import RFPDupeFilter  # renamed to scrapy.dupefilters in Scrapy >= 1.0

Walking through the de-duplication source

from __future__ import print_function
import os
import logging

from scrapy.utils.job import job_dir
from scrapy.utils.request import request_fingerprint


class BaseDupeFilter(object):

    @classmethod
    def from_settings(cls, settings):
        return cls()

    def request_seen(self, request):
        return False

    def open(self):  # can return deferred
        pass

    def close(self, reason):  # can return a deferred
        pass

    def log(self, request, spider):  # log that a request has been filtered
        pass


# Inherits from BaseDupeFilter (which defines the five methods above)
class RFPDupeFilter(BaseDupeFilter):
    """Request Fingerprint duplicates filter"""

    def __init__(self, path=None, debug=False):
        # debug=True logs every filtered duplicate; if a path is given, fingerprints are also persisted to a file there
        self.file = None
        self.fingerprints = set()
        self.logdupes = True
        self.debug = debug
        self.logger = logging.getLogger(__name__)
        if path:
            self.file = open(os.path.join(path, 'requests.seen'), 'a+')
            self.file.seek(0)
            self.fingerprints.update(x.rstrip() for x in self.file)

    @classmethod
    def from_settings(cls, settings):
        # the debug flag and the storage path (from JOBDIR) come from the settings
        debug = settings.getbool('DUPEFILTER_DEBUG')
        return cls(job_dir(settings), debug)

    def request_seen(self, request):
        # compute an MD5-like fingerprint of the request URL
        fp = self.request_fingerprint(request)
        # fingerprints is a set
        if fp in self.fingerprints:
            return True  # already seen
        # not seen before: record the fingerprint
        self.fingerprints.add(fp)  # add it to the set
        if self.file:
            self.file.write(fp + os.linesep)

    def request_fingerprint(self, request):
        return request_fingerprint(request)

    def close(self, reason):
        if self.file:
            self.file.close()

    def log(self, request, spider):
        if self.debug:
            msg = "Filtered duplicate request: %(request)s"
            self.logger.debug(msg, {'request': request}, extra={'spider': spider})
        elif self.logdupes:
            msg = ("Filtered duplicate request: %(request)s"
                   " - no more duplicates will be shown"
                   " (see DUPEFILTER_DEBUG to show all duplicates)")
            self.logger.debug(msg, {'request': request}, extra={'spider': spider})
            self.logdupes = False

        spider.crawler.stats.inc_value('dupefilter/filtered', spider=spider)
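
The fingerprint is what makes the filter robust to superficial URL differences: request_fingerprint canonicalises the URL (including sorting the query parameters) before hashing. A minimal sketch to verify this, using example.com URLs of my own rather than anything from the original post (note that newer Scrapy releases supersede this helper with the REQUEST_FINGERPRINTER_CLASS mechanism):

from scrapy.http import Request
from scrapy.utils.request import request_fingerprint

# same parameters in a different order -> same fingerprint
fp1 = request_fingerprint(Request('http://www.example.com/?k1=1&k2=2'))
fp2 = request_fingerprint(Request('http://www.example.com/?k2=2&k1=1'))
# a different parameter value -> different fingerprint
fp3 = request_fingerprint(Request('http://www.example.com/?k1=1&k2=3'))

print(fp1 == fp2)  # True
print(fp1 == fp3)  # False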

Custom de-duplication rules

a. Write the filter class (dupefilters.py)

from scrapy.utils.request import request_fingerprint
from scrapy.dupefilter import BaseDupeFilter


class WwwDupeFilter(BaseDupeFilter):
    def __init__(self):
        # initialise visited_fd as a set (it could also live in Redis; see the sketch after this class)
        self.visited_fd = set()

    @classmethod
    def from_settings(cls, settings):
        return cls()

    def request_seen(self, request):
        # request: the incoming request; its URL is reduced to an MD5-like fingerprint
        # http://www.baidu.com?su=123&name=456
        # http://www.baidu.com?name=456&su=123  -> these two yield the same fingerprint
        # the fingerprint is computed by request_fingerprint
        fd = request_fingerprint(request=request)
        # if the fingerprint is already in visited_fd, return True (the request is filtered out)
        if fd in self.visited_fd:
            return True
        # otherwise record it in the set
        self.visited_fd.add(fd)

    def open(self):  # can return deferred
        print('open')

    def close(self, reason):  # can return a deferred
        print('closed')

    def log(self, request, spider):  # log that a request has been filtered
        print('log')
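
The comment in __init__ notes that the set could also live in Redis, which would let several crawler processes share one seen-request state. A rough sketch of that variant, assuming the third-party redis package and a Redis server on localhost (none of this comes from the original post):

import redis
from scrapy.dupefilter import BaseDupeFilter
from scrapy.utils.request import request_fingerprint


class RedisDupeFilter(BaseDupeFilter):
    """Hypothetical Redis-backed filter: fingerprints are kept in a Redis set."""

    def __init__(self, host='127.0.0.1', port=6379, key='dupefilter:fingerprints'):
        self.conn = redis.Redis(host=host, port=port)
        self.key = key

    @classmethod
    def from_settings(cls, settings):
        return cls()

    def request_seen(self, request):
        fd = request_fingerprint(request=request)
        # SADD returns 1 if the member is new, 0 if it was already in the set
        return self.conn.sadd(self.key, fd) == 0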

b. Change the default dedup rule in settings.py

# default dedup rule
# DUPEFILTER_CLASS = 'scrapy.dupefilter.RFPDupeFilter'
# point it at the custom rule instead (the project here is named `www`)
DUPEFILTER_CLASS = 'www.dupefilters.WwwDupeFilter'

c. Controlling de-duplication from the spider

# -*- coding: utf-8 -*-
import scrapy
import sys, os, io
from scrapy.http import Request
from scrapy.dupefilter import RFPDupeFilter
from scrapy.http.response.html import HtmlResponse

from www.items import WwwItem

sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')  # re-wrap stdout so Chinese output prints correctly on a GB-encoded console


class ChoutiSpider(scrapy.Spider):
    name = 'chouti'
    allowed_domains = ['chouti.com']
    start_urls = ['http://chouti.com/']

    def parse(self, response):
        # among the descendants, find the div with id="content-list" and its item divs
        # f = open('news.log', mode='a+')
        item_list = response.xpath('//div[@id="content-list"]/div[@class="item"]')
        for item in item_list:
            text = item.xpath('.//a/text()').extract_first()
            href = item.xpath('.//a/@href').extract_first()
            yield WwwItem(title=text, href=href)
        page_list = response.xpath('//*[@id="dig_lcpage"]//a/@href').extract()
        for page in page_list:
            page = 'https://dig.chouti.com' + page
            yield Request(url=page, callback=self.parse, dont_filter=False)
            # dont_filter controls de-duplication: False (the default) obeys the filter, True bypasses it; see the variant sketch below
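
Scrapy's built-in start_requests already sets dont_filter=True for the start URLs, which is why the first page is always fetched even if its fingerprint is on record. A hypothetical variant of the spider above that makes this explicit (spider name and URLs are illustrative):

import scrapy
from scrapy.http import Request


class ChoutiRefreshSpider(scrapy.Spider):
    name = 'chouti_refresh'
    start_urls = ['https://dig.chouti.com/']

    def start_requests(self):
        for url in self.start_urls:
            # always scheduled, even if the fingerprint was seen before
            yield Request(url=url, callback=self.parse, dont_filter=True)

    def parse(self, response):
        # pagination links go through the dupe filter as usual
        for href in response.xpath('//*[@id="dig_lcpage"]//a/@href').extract():
            yield Request(url=response.urljoin(href), callback=self.parse, dont_filter=False)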

By default Scrapy uses scrapy.dupefilter.RFPDupeFilter for de-duplication; the related settings are:

DUPEFILTER_CLASS = 'scrapy.dupefilter.RFPDupeFilter'
DUPEFILTER_DEBUG = False
JOBDIR = "directory where the seen-request log is kept, e.g. /root/"  # final path: /root/requests.seen
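
With JOBDIR set, RFPDupeFilter appends each new fingerprint to <JOBDIR>/requests.seen, one per line, and reads the file back into its in-memory set on the next run (see __init__ above), so de-duplication survives restarts. A small, purely illustrative way to inspect that file (the path is an assumption matching the JOBDIR above):

import os

jobdir = '/root'  # assumption: whatever JOBDIR points at
with open(os.path.join(jobdir, 'requests.seen')) as f:
    fingerprints = {line.rstrip() for line in f}
print(len(fingerprints), 'requests already recorded')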

 

