Skip data that has already been crawled (a minimal Redis sketch follows this list):
1. Judge by URL
2. Judge by data fingerprint
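Both approaches rest on the same Redis primitive: sadd adds a member to a set and returns 1 if it was new, 0 if it already existed. A minimal sketch of that check, assuming a local Redis on port 6388 as used throughout this post (the key seen_urls and the sample URLs are only for illustration):

from redis import Redis

conn = Redis(host='127.0.0.1', port=6388)

for url in ['http://example.com/a', 'http://example.com/a']:
    # sadd returns 1 for a brand-new member, 0 for a duplicate
    if conn.sadd('seen_urls', url) == 1:
        print('new record, crawl it:', url)
    else:
        print('already seen, skip:', url)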
Create the crawler project: scrapy startproject xxx
cd xxx
Create the spider file: scrapy genspider -t crawl spidername www.xxx.com
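For orientation, scrapy startproject generates a project skeleton roughly like the one below (file names are Scrapy's defaults; the spider file appears after running genspider):

xxx/
    scrapy.cfg
    xxx/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py
            spidername.py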
I. Deduplicate by URL
Spider file
# -*- coding: utf-8 -*-
import scrapy
from redis import Redis
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from increment1.items import Increment1Item
"""
Skip data that has already been crawled:
1. Judge by URL
2. Judge by data fingerprint
"""


class FirstSpider(CrawlSpider):
    name = 'first'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://www.4567tv.tv/index.php/vod/show/id/7.html']

    rules = (
        Rule(LinkExtractor(allow=r'/index.php/vod/show/id/7/page/\d+\.html'), callback='parse_item', follow=True),
    )

    def parse_detail(self, response):
        item = response.meta['item']  # the item travels in request meta, so read it from response.meta
        actor = response.xpath('//div[@class="stui-content__detail"]/p[3]//text()').extract_first()
        item['actor'] = actor
        yield item

    def parse_item(self, response):
        conn = Redis(host='127.0.0.1', port=6388)
        # extract() (not extract_first()) so every detail-page link on the page is collected
        detail_url_list = ['https://www.4567tv.tv' + href for href in
                           response.xpath('//li[@class="col-md-6 col-sm-4 col-xs-3"]/div/a/@href').extract()]
        for url in detail_url_list:
            item = Increment1Item()
            ex = conn.sadd('movies_url', url)
            if ex == 1:  # sadd returned 1, so this URL is not in Redis yet
                # meta must be a dict ({'item': item}), not a set
                yield scrapy.Request(url=url, callback=self.parse_detail, meta={'item': item})
            else:
                print('Already crawled, skipping!!!')
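The spider imports Increment1Item from increment1.items, but the original post never shows items.py. Inferred from the single field the spider assigns, a minimal sketch would be:

# increment1/items.py -- inferred from the spider above; not shown in the original post
import scrapy


class Increment1Item(scrapy.Item):
    actor = scrapy.Field()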
Persist the data in the pipeline file
import json

from redis import Redis


class Increment1Pipeline(object):
    def open_spider(self, spider):
        self.conn = Redis(host='127.0.0.1', port=6388)

    def process_item(self, item, spider):
        print('Writing new data')
        dic = {
            'actor': item['actor']
        }
        # redis-py cannot lpush a dict directly, so serialize it to JSON first
        self.conn.lpush('move_data', json.dumps(dic))
        return item
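The pipeline only runs if it is registered in settings.py. The original post omits this step; the standard Scrapy configuration would be:

# increment1/settings.py -- assumed; 300 is just the conventional priority
ITEM_PIPELINES = {
    'increment1.pipelines.Increment1Pipeline': 300,
}

The stored records can later be read back with conn.lrange('move_data', 0, -1) and json.loads.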
II. Deduplicate by data fingerprint
Spider file
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from incerment2.items import Incerment2Item
import hashlib
from redis import Redis


class FirstSpider(CrawlSpider):
    name = 'first'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://www.qiushibaike.com/text/']

    rules = (
        Rule(LinkExtractor(allow=r'/text/page/\d+/'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        self.conn = Redis(host='127.0.0.1', port=6388)

        div_list = response.xpath('//div[@class="content-left"]/div')
        for div in div_list:
            item = Incerment2Item()
            item['author'] = div.xpath('./div[1]/a[2]/h2/text()').extract_first()
            content = div.xpath('./a[1]/div/span/text()').extract()
            content = ''.join(content)
            item['content'] = content

            # Hash author+content as a unique identifier (the data fingerprint);
            # `or ''` guards against anonymous posts where author is None
            source = (item['author'] or '') + item['content']
            hashValue = hashlib.sha256(source.encode()).hexdigest()

            # sadd returns 1 only when this fingerprint has never been stored
            ex = self.conn.sadd('hashValue', hashValue)
            if ex == 1:
                yield item
            else:
                print('Data unchanged, skipping')
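As in part one, items.py is not shown in the original post. Inferred from the two fields the spider fills in, a minimal sketch:

# incerment2/items.py -- inferred from the spider above; not shown in the original post
import scrapy


class Incerment2Item(scrapy.Item):
    author = scrapy.Field()
    content = scrapy.Field()

Note that this variant still needs a pipeline (or other persistence) registered in settings.py; the fingerprint set in Redis only handles deduplication.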