Incremental crawling with Scrapy


Skip data that has already been crawled. Two ways to decide whether a record was seen before:
1. Deduplicate by URL
2. Deduplicate by data fingerprint
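
Both approaches rely on the same Redis primitive: SADD returns 1 when the member is new to the set and 0 when it is already there. A minimal sketch of that check (assuming a Redis instance on 127.0.0.1:6388 as in the code below; the example URL is just a placeholder):

from redis import Redis

conn = Redis(host='127.0.0.1', port=6388)

# First time this URL is seen: sadd returns 1, so it should be crawled
print(conn.sadd('movies_url', 'https://www.example.com/detail/1'))   # 1

# Second time: the member already exists, sadd returns 0, so it is skipped
print(conn.sadd('movies_url', 'https://www.example.com/detail/1'))   # 0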

 

Create the crawler project: scrapy startproject xxx

cd xxx

Create the spider file: scrapy genspider -t crawl spidername www.xxx.com
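
For the first project below, those commands would look roughly like this (the project name increment1, the spider name first and the target domain are taken from the spider code; the domain argument is only a starting hint for Scrapy and can be adjusted):

scrapy startproject increment1
cd increment1
scrapy genspider -t crawl first www.4567tv.tv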

1. Deduplicate by URL

Spider file

# -*- coding: utf-8 -*-
import scrapy
from redis import Redis
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from increment1.items import Increment1Item
"""
Skip data that has already been crawled:
1. Deduplicate by URL
2. Deduplicate by data fingerprint
"""


class FirstSpider(CrawlSpider):
    name = 'first'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://www.4567tv.tv/index.php/vod/show/id/7.html']

    rules = (
        Rule(LinkExtractor(allow=r'/index.php/vod/show/id/7/page/\d+\.html'), callback='parse_item', follow=True),
    )

    def parse_detail(self, response):
        # Retrieve the item passed along through meta by parse_item
        item = response.meta['item']
        actor = response.xpath('//div[@class="stui-content__detail"]/p[3]//text()').extract_first()
        item['actor'] = actor
        yield item

    def parse_item(self, response):
        conn = Redis(host='127.0.0.1', port=6388)
        # Build the full detail-page URL for every entry on the current list page
        detail_url_list = [
            'https://www.4567tv.tv' + href
            for href in response.xpath('//li[@class="col-md-6 col-sm-4 col-xs-3"]/div/a/@href').extract()
        ]
        for url in detail_url_list:
            item = Increment1Item()
            # sadd returns 1 if the URL was not in Redis yet (new data), 0 otherwise
            ex = conn.sadd('movies_url', url)
            if ex == 1:
                yield scrapy.Request(url=url, callback=self.parse_detail, meta={'item': item})
            else:
                print('Already crawled, skipping!')
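
The Increment1Item used above only needs a single field. A minimal items.py sketch (the actor field name comes from the spider code; the rest is standard Scrapy item boilerplate):

# increment1/items.py
import scrapy


class Increment1Item(scrapy.Item):
    # the only field the spider fills in
    actor = scrapy.Field()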

 

Store the data in the pipeline file

import json

from redis import Redis


class Increment1Pipeline(object):
    def open_spider(self, spider):
        self.conn = Redis(host='127.0.0.1', port=6388)

    def process_item(self, item, spider):
        print('Writing new data')
        dic = {
            'actor': item['actor']
        }
        # redis-py only accepts strings/bytes/numbers, so serialize the dict before pushing it
        self.conn.lpush('move_data', json.dumps(dic))
        return item
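
The pipeline only runs if it is enabled in the project's settings.py. A typical entry (the dotted path follows from the class above; 300 is just the conventional priority value):

ITEM_PIPELINES = {
    'increment1.pipelines.Increment1Pipeline': 300,
}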

 

2. Deduplicate by data fingerprint
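
The fingerprint is simply a SHA-256 hash of the record's text fields: identical records always produce the same digest, so the digest can be deduplicated in a Redis set exactly like a URL. A quick illustration (the author/content strings are placeholders):

import hashlib

source = 'some author' + 'some content'
fingerprint = hashlib.sha256(source.encode()).hexdigest()
print(fingerprint)   # 64-character hex digest, identical for identical input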

Spider file

# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from incerment2.items import Incerment2Item
import hashlib
from redis import Redis


class FirstSpider(CrawlSpider):
    name = 'first'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://www.qiushibaike.com/text/']

    rules = (
        Rule(LinkExtractor(allow=r'/text/page/\d+/'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        self.conn = Redis(host='127.0.0.1', port=6388)

        div_list = response.xpath('//div[@class="content-left"]/div')
        for div in div_list:
            item = Incerment2Item()
            # extract_first() can return None, so fall back to '' to keep the concatenation below safe
            item['author'] = div.xpath('./div[1]/a[2]/h2/text()').extract_first() or ''
            content = div.xpath('./a[1]/div/span/text()').extract()
            content = ''.join(content)
            item['content'] = content

            # Hash the scraped record to get a unique data fingerprint
            source = item['author'] + item['content']
            hashValue = hashlib.sha256(source.encode()).hexdigest()

            # sadd returns 1 only if this fingerprint has never been stored before
            ex = self.conn.sadd('hashValue', hashValue)
            if ex == 1:
                yield item
            else:
                print('Data not updated, skipping')
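
As in the first project, Incerment2Item only needs the fields the spider fills in. A minimal items.py sketch (field names taken from the spider code):

# incerment2/items.py
import scrapy


class Incerment2Item(scrapy.Item):
    author = scrapy.Field()
    content = scrapy.Field()

The post does not show a pipeline for this project; it would follow the same pattern as Increment1Pipeline above, pushing the author/content pair into a Redis list.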

 

