自定義去重
-類。自定義一個類。DUPEFILTER_CLASS = 'sp2.rep.RepeatUrl'
-配置文件中指定 :scrapy.dupefilter.RFPDupeFilter
scrapy默認使用 scrapy.dupefilter.RFPDupeFilter 進行去重,相關配置有:
from scrapy.dupefilter import RFPDupeFilter
DUPEFILTER_CLASS = 'scrapy.dupefilter.RFPDupeFilter' DUPEFILTER_DEBUG = False JOBDIR = "保存訪問記錄的日志路徑,如:/root/" # 最終路徑為 /root/requests.seen
#DUPEFILTER_CLASS = 'sp2.rep.RepeatUrl'

class RepeatUrl:
    """Custom URL de-duplication filter.

    Enable it via the setting ``DUPEFILTER_CLASS = 'sp2.rep.RepeatUrl'``.
    Tracks every URL seen during the crawl in an in-memory set.
    """

    def __init__(self):
        # URLs encountered so far in this crawl session.
        self.visited_url = set()

    @classmethod
    def from_settings(cls, settings):
        """Factory called by the framework at initialization time.

        :param settings: crawler settings (unused here)
        :return: a fresh filter instance
        """
        return cls()

    def request_seen(self, request):
        """Check whether this request's URL was already visited.

        :param request: object exposing a ``.url`` attribute
        :return: True if already visited; False if this is the first time
        """
        already_seen = request.url in self.visited_url
        if not already_seen:
            # Record the URL so subsequent identical requests are filtered.
            self.visited_url.add(request.url)
        return already_seen

    def open(self):
        """Called when crawling starts."""
        print('open replication')

    def close(self, reason):
        """Called when crawling ends.

        :param reason: textual shutdown reason supplied by the framework
        """
        print('close replication')

    def log(self, request, spider):
        """Log that a duplicate request was filtered.

        :param request: the duplicate request
        :param spider: the spider that produced it
        """
        print('repeat', request.url)