Use Scrapy to scrape rental listings from Danke Apartment (蛋殼公寓), taking Beijing as an example. Target URL: https://www.dankegongyu.com/room/bj
Approach
Newly posted listings always show up on the first page, so the plan is to re-scrape the first page automatically at regular intervals to capture the latest listings.
Taking advantage of Redis's set data structure, store the URL of every scraped listing in Redis (a minimal sketch of this check follows below).
For each request, compare the requested URL against those already in Redis: if the URL is already there, the listing has been seen before and the request is ignored; if it is not there, the listing is new, so scrape it and add its URL to Redis.
Inspecting the page source shows that the site is served as static HTML. The spider first collects the URL of every listing on the first page, then requests each of those URLs; all fields are extracted from the detail page.
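Before wiring this into Scrapy, the deduplication idea can be sketched on its own: Redis's SADD returns 1 when a member is new and 0 when it already exists, so a single call both records a URL and tells us whether it has been seen. The following is a minimal standalone sketch, not code from the original project; the key name 'dk_url' and db number mirror the middleware shown later, while the Redis host, the helper is_new_url, and the example URL are made up for illustration.

import hashlib

import redis

# placeholder connection settings; point this at your own Redis instance
r = redis.StrictRedis(host='127.0.0.1', port=6379, db=3)

def is_new_url(url, key='dk_url'):
    """Record the URL and return True if it has not been seen before."""
    url_md5 = hashlib.md5(url.encode()).hexdigest()
    # sadd returns 1 if the member was added (new URL), 0 if it already existed
    return r.sadd(key, url_md5) == 1

print(is_new_url('https://www.dankegongyu.com/room/example.html'))  # True on the first call
print(is_new_url('https://www.dankegongyu.com/room/example.html'))  # False afterwards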
Code implementation
Define the fields to scrape
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class DankeItem(scrapy.Item):
    """Fields to be scraped"""
    # data source
    source = scrapy.Field()
    # scrape time
    utc_time = scrapy.Field()
    # room name
    room_name = scrapy.Field()
    # rent
    room_money = scrapy.Field()
    # room area
    room_area = scrapy.Field()
    # room number
    room_numb = scrapy.Field()
    # room layout
    room_type = scrapy.Field()
    # rental type (whole flat / shared)
    rent_type = scrapy.Field()
    # floor
    room_floor = scrapy.Field()
    # district
    room_loca = scrapy.Field()
    # housing estate
    estate_name = scrapy.Field()
Write the spider logic
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from danke.items import DankeItem


class DankeSpider(CrawlSpider):
    # spider name
    name = 'dkgy3'
    # domains the spider is allowed to crawl
    allowed_domains = ['dankegongyu.com']
    custom_settings = {'DOWNLOAD_DELAY': 0.2}
    # start URL (Shenzhen listing page; swap in https://www.dankegongyu.com/room/bj for the Beijing example from the intro)
    start_urls = ['https://www.dankegongyu.com/room/sz']

    # rule that matches detail-page links; detail pages are not followed further
    rules = (
        Rule(LinkExtractor(allow=r'https://www.dankegongyu.com/room/\d+'),
             callback='parse_detail', follow=False),
    )

    def parse_detail(self, response):
        """
        Parse the detail page.
        :param response:
        :return:
        """
        node_list = response.xpath('//div[@class="room-detail-right"]')
        for node in node_list:
            item = DankeItem()

            # room name
            room_name = node.xpath('./div/h1/text()')
            item['room_name'] = room_name.extract_first()

            # rent: some rooms advertise a discounted first-month rent, which uses
            # different markup, so a second XPath is tried when the first one is empty
            room_money = node.xpath('./div[@class="room-price"]/div/span').xpath('string(.)').extract_first()
            if room_money:
                item['room_money'] = room_money
            else:
                room_money = node.xpath(
                    './div[@class="room-price hot"]/div/div[@class="room-price-num"]/text()').extract_first()
                item['room_money'] = room_money
            print(room_money)

            # room area
            room_area = node.xpath(
                './*/div[@class="room-detail-box"]/div[1]/label/text()').extract_first().split('：')[-1]
            item['room_area'] = room_area

            # room number
            room_numb = node.xpath(
                './*/div[@class="room-detail-box"]/div[2]/label/text()').extract_first().split('：')[-1]
            item['room_numb'] = room_numb

            # room layout
            room_type = node.xpath(
                './*/div[@class="room-detail-box"]/div[3]/label/text()').extract_first().split('：')[-1]
            item['room_type'] = room_type

            # rental type
            rent_type = node.xpath(
                './*/div[@class="room-detail-box"]/div[3]/label/b/text()').extract_first().split('：')[-1]
            item['rent_type'] = rent_type

            # floor
            room_floor = node.xpath(
                './div[@class="room-list-box"]/div[2]/div[2]').xpath('string(.)').extract_first().split('：')[-1]
            item['room_floor'] = room_floor

            # district
            room_loca = node.xpath(
                './div[@class="room-list-box"]/div[2]/div[3]/label/div/a[1]/text()').extract_first()
            item['room_loca'] = room_loca

            # housing estate
            estate_name = node.xpath(
                './div[@class="room-list-box"]/div[2]/div[3]/label/div/a[3]/text()').extract_first()
            item['estate_name'] = estate_name

            yield item
Write the downloader middleware
The downloader middleware implements two things: adding a random User-Agent header to each request, and storing URLs in Redis for deduplication.
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html

import random
import hashlib

import redis
from scrapy.exceptions import IgnoreRequest

from danke.settings import USER_AGENTS as ua


class DankeSpiderMiddleware(object):

    def process_request(self, request, spider):
        """
        Assign a random User-Agent to each request.
        :param request:
        :param spider:
        :return:
        """
        user_agent = random.choice(ua)
        request.headers['User-Agent'] = user_agent


class DankeRedisMiddleware(object):
    """
    Store the URL of every listing found on the first page in a Redis set,
    so the same listing is never scraped twice.
    """
    # connect to Redis
    def __init__(self):
        self.redis = redis.StrictRedis(host='39.106.116.21', port=6379, db=3)

    def process_request(self, request, spider):
        # only detail-page links are stored in Redis
        if request.url.endswith(".html"):
            # hash the detail-page URL with MD5
            url_md5 = hashlib.md5(request.url.encode()).hexdigest()
            # sadd returns 1 (truthy) if the member was added, 0 if it already existed
            result = self.redis.sadd('dk_url', url_md5)
            # already in the set: the listing has been scraped before, ignore the request
            if not result:
                raise IgnoreRequest
Data storage
# -*- coding: utf-8 -*-
from datetime import datetime

import pymysql


class DankeSourcePipeline(object):

    def process_item(self, item, spider):
        item['source'] = spider.name
        item['utc_time'] = str(datetime.utcnow())
        return item


class DankePipeline(object):

    def __init__(self):
        self.conn = pymysql.connect(
            host='39.106.116.21',
            port=3306,
            database='***',
            user='***',
            password='****',
            charset='utf8'
        )
        # create a cursor
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        sql = ("insert into result_latest(標題, 租金, 面積, "
               "編號, 戶型, 出租方式, 樓層, "
               "區域, 樓盤, 抓取時間, 數據來源)"
               "values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")

        item = dict(item)

        data = [
            item['room_name'], item['room_money'], item['room_area'],
            item['room_numb'], item['room_type'], item['rent_type'],
            item['room_floor'], item['room_loca'], item['estate_name'],
            item['utc_time'], item['source'],
        ]

        self.cursor.execute(sql, data)
        # commit the transaction
        self.conn.commit()

        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()
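The middlewares and pipelines above only take effect once they are registered in settings.py, and DankeSpiderMiddleware also imports a USER_AGENTS list from there. The original settings file is not shown in this article, so the following is only a sketch of what it might look like: the module paths assume the default danke project layout (middlewares.py, pipelines.py), and the priority numbers and User-Agent strings are placeholders.

# settings.py (sketch; priorities and User-Agent strings are illustrative)

# pool of User-Agent strings read by DankeSpiderMiddleware
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0 Safari/605.1.15',
]

# enable both downloader middlewares
DOWNLOADER_MIDDLEWARES = {
    'danke.middlewares.DankeSpiderMiddleware': 543,
    'danke.middlewares.DankeRedisMiddleware': 544,
}

# run the source/time pipeline before the MySQL pipeline
ITEM_PIPELINES = {
    'danke.pipelines.DankeSourcePipeline': 300,
    'danke.pipelines.DankePipeline': 301,
}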
Automate recurring crawls
import os
import time

while True:
    """
    Crawl once every 20*60*60 seconds so the data stays up to date.
    """
    os.system("scrapy crawl dkgy3")
    time.sleep(20*60*60)

# from scrapy import cmdline
# cmdline.execute("scrapy crawl dkgy3".split())
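Note that the commented-out cmdline.execute variant runs the spider inside the current process and is only suitable for a single crawl, since Scrapy's Twisted reactor cannot be restarted once it has stopped; shelling out with os.system starts a fresh process on every iteration, which is what makes the loop work.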
Complete code
See: https://github.com/zInPython/danke