python爬取北京貝殼找房網數據
一,選題背景
貝殼找房業務涉及二手房,新房,租房,商業辦公等。平台擁有全面真實的房源信息,為需要找房的人提供安全可靠的購房體驗。對北京貝殼找房網進行數據爬取
要達到的數據分析的預期目標是:
1,對爬取的房源信息進行可視化處理。
2,預期目標歸類二手房源進行可視化處理。
二,爬蟲設計方案
1,爬蟲名稱:
爬取北京貝殼找房網數據可視化處理。
2,爬蟲爬取的內容與數據特征分析:
目標網站是北京貝殼找房網,其原理主要是通過Requests發送請求獲取HTML頁面,再用lxml的XPath解析頁面內容,從而得到北京市房源數據
3. 方案概述
分析網站頁面結構,找到爬取數據的位置,根據不同的數據制定不同的爬取方法,將爬取的數據保存成csv文件,然后再將csv文件里的數據進行可視化處理。
第一步 分析網站
第二步 發送請求並獲取頁面數據
第三步 獲取北京市房源數據
第四步 繪制柱狀圖等
三,主題頁面的結構特征分析
1,主題頁面的結構與特性分析
通過瀏覽器“審查元素”查看源代碼及“網絡”反饋的消息(按f12可以獲取),如下圖所示:
網站html頁面結構分析
四,爬蟲程序設計
1. 數據的爬取
(1)北京市房源數據的爬取
import os
import random
import time
from multiprocessing import Pool

import pandas as pd
import requests
from lxml import etree

# Seconds to wait for the server before abandoning a single request.
REQUEST_TIMEOUT = 10

# Every scraped listing is appended to this file as one header-less row.
OUTPUT_CSV = 'beijing_fang111.csv'

# Column labels used when a scraped row is turned into a DataFrame.
COLUMNS = [
    '小區位置', '小區名稱', '房屋總價', '房屋單價', '房屋基本信息',
    '房屋交易信息', '小區均價', '小區建造時間', '小區房屋類型', '小區層數',
]

# Browser session cookie captured by the author. Session cookies expire, so
# the BEIKE_COOKIE environment variable may supply a fresh one instead.
_DEFAULT_COOKIE = (
    'lianjia_uuid=e6a91b7a-b6a4-40b5-88c6-ff67759cbc8a; '
    'crosSdkDT2019DeviceId=-51npj6--xbmlw5-f22i5qg8bh36ouv-yttqkmwdf; '
    '_ga=GA1.2.121082359.1579583230; '
    'ke_uuid=6de1afa21a5799c0874702af39248907; '
    '__xsptplus788=788.1.1579583230.1579583347.4%234%7C%7C%7C%7C%7C%23%23Q6jl-k46IlXjCORdTOp6O3JyzHokoUrb%23; '
    'select_city=110000; '
    'digv_extends=%7B%22utmTrackId%22%3A%2280418605%22%7D; '
    'lianjia_ssid=a4ab1bc0-cb04-492f-960c-342c66065da0; '
    'Hm_lvt_9152f8221cb6243a53c83b956842be8a=1583897013,1583932737; '
    'User-Realip=111.196.247.121; '
    'sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%2216fc67f100b140-06f07f8f707639-33365a06-1049088-16fc67f100c603%22%2C%22%24device_id%22%3A%2216fc67f100b140-06f07f8f707639-33365a06-1049088-16fc67f100c603%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24latest_referrer_host%22%3A%22%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_utm_source%22%3A%22baidu%22%2C%22%24latest_utm_medium%22%3A%22pinzhuan%22%2C%22%24latest_utm_campaign%22%3A%22wybeijing%22%2C%22%24latest_utm_content%22%3A%22biaotimiaoshu%22%2C%22%24latest_utm_term%22%3A%22biaoti%22%7D%7D; '
    'Hm_lpvt_9152f8221cb6243a53c83b956842be8a=1583933576; '
    'srcid=eyJ0Ijoie1wiZGF0YVwiOlwiMjAxZjBjNWU1ZWE1ZGVmYjQxZDFlYTE4MGVkNWI1OGRjYzk5Mzc2MjEwNTcyMWI3ODhiNTQyNTExOGQ1NTVlZDNkMTY2MWE2YWI5YWRlMGY0NDA3NjkwNWEyMzRlNTdhZWExNDViNGFiNWVmMmMyZWJlZGY1ZjM2Y2M0NWIxMWZlMWFiOWI2MDJiMzFmOTJmYzgxNzNiZTIwMzE1ZGJjNTUyMWE2ZjcxYzZmMTFhOWIyOWU2NzJkZTkyZjc3ZDk1MzhiNjhhMTQyZDQ2YmEyNjJhYzJmNjdjNmFjM2I5YzU0MzdjMDkwYWUwMzZmZjVjYWZkZTY5YjllYzY0NzEwMWY2OTc1NmU1Y2ExNzNhOWRmZTdiNGY4M2E1Zjc2NDZmY2JkMGM2N2JiMjdmZTJjNjI2MzNkMjdlNDY4ODljZGRjMjc3MTQ0NDUxMDllZThlZDVmZmMwMjViNjc2ZjFlY1wiLFwia2V5X2lkXCI6XCIxXCIsXCJzaWduXCI6XCJkMDI2MDk0N1wifSIsInIiOiJodHRwczovL2JqLmtlLmNvbS9lcnNob3VmYW5nLzE5MTExMzE5NTEwMTAwMTcxNzU5Lmh0bWwiLCJvcyI6IndlYiIsInYiOiIwLjEifQ=='
)

# Request headers shared by the listing-page and detail-page requests.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36',
    'Cookie': os.environ.get('BEIKE_COOKIE', _DEFAULT_COOKIE),
}

# Failures that make one page or one listing unusable but should not stop
# the crawl: network errors, unparsable HTML, and missing page elements.
_SCRAPE_ERRORS = (
    requests.RequestException,
    etree.LxmlError,
    ValueError,
    IndexError,
)


def _fetch_html(url):
    """Download ``url`` with the shared headers and return the parsed tree.

    Raises:
        requests.RequestException: the request failed or timed out.
        ValueError: the response could not be turned into a document.
    """
    text = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT).text
    tree = etree.HTML(text)
    if tree is None:
        raise ValueError('empty document returned for {}'.format(url))
    return tree


def get_home_url(page):
    """Return the detail-page URLs found on listing page number ``page``."""
    url = 'http://bj.ke.com/ershoufang/pg{}/'.format(page)
    html = _fetch_html(url)
    return html.xpath(
        '//ul[@class="sellListContent"]//li[@class="clear"]/a/@href')


def get_home_detail_infos(detail_url):
    """Scrape one listing's detail page.

    Returns:
        A 10-item list in the order of ``COLUMNS``. The location, the basic
        attributes and the transaction attributes are lists of strings; the
        other items are single strings.

    Raises:
        IndexError: an expected single-value element is missing from the
            page.
    """
    html = _fetch_html(detail_url)

    overview = ('//div[@data-component="overviewIntro"]'
                '//div[@class="content"]')
    intro = '//div[@class="introContent"]'
    community = '//div[@class="xiaoquCard"]//div[@class="xiaoqu_main fl"]'

    # Location (list of area names).
    home_location = html.xpath(
        overview + '//div[@class="areaName"]/span[@class="info"]/a/text()')
    # Community name.
    local_name = html.xpath(
        overview + '//div[@class="communityName"]/a/text()')[0]
    # Total price. The trailing space in "price " is part of the class value.
    total_price = html.xpath(
        overview + '//div[@class="price "]/span[@class="total"]/text()')[0]
    # Unit price.
    unit_price = html.xpath(
        overview
        + '//div[@class="price "]//div[@class="unitPrice"]/span/text()')[0]
    # Basic attributes of the home (list).
    home_style = html.xpath(
        intro + '//div[@class="base"]//div[@class="content"]/ul/li/text()')
    # Transaction attributes (list).
    transaction_info = html.xpath(
        intro
        + '//div[@class="transaction"]//div[@class="content"]/ul/li/text()')
    # Community average price.
    xiaoqu_price = html.xpath(
        community + '//span[@class="xiaoqu_main_info price_red"]/text()'
    )[0].replace(' ', '')
    # Build time, building type and the third community field come from the
    # same node set, so the query is evaluated once and indexed three times.
    community_info = html.xpath(
        community + '//span[@class="xiaoqu_main_info"]/text()')
    xiaoqu_built_time, xiaoqu_built_style, xiaoqu_total_ceng = (
        community_info[i].replace(' ', '').replace('\n', '')
        for i in range(3)
    )

    return [
        home_location, local_name, total_price, unit_price, home_style,
        transaction_info, xiaoqu_price, xiaoqu_built_time,
        xiaoqu_built_style, xiaoqu_total_ceng,
    ]


def save_data(data):
    """Append the scraped rows in ``data`` to ``OUTPUT_CSV`` without a header.

    Args:
        data: list of rows, each a 10-item list in the order of ``COLUMNS``.
    """
    data_frame = pd.DataFrame(data, columns=COLUMNS)
    print(data_frame)
    # The utf_8_sig codec emits a BOM each time the file is opened, so using
    # it for every append would put a BOM in front of every appended row.
    # Only a new or empty file gets the BOM.
    is_new_file = (not os.path.exists(OUTPUT_CSV)
                   or os.path.getsize(OUTPUT_CSV) == 0)
    data_frame.to_csv(
        OUTPUT_CSV,
        header=False,
        index=False,
        mode='a',
        encoding='utf_8_sig' if is_new_file else 'utf_8',
    )


def main(page):
    """Scrape every listing linked from listing page ``page`` and save it.

    A page or listing that fails to download or parse is reported and
    skipped, so one bad page does not abort the whole worker pool.
    """
    print('開始爬取第{}頁的數據!'.format(page))
    try:
        urls = get_home_url(page)
    except _SCRAPE_ERRORS as exc:
        print('第{}頁爬取失敗,已跳過:{!r}'.format(page, exc))
        return
    for url in urls:
        print('開始爬去詳細網頁為{}的房屋詳細信息資料!'.format(url))
        try:
            all_data = get_home_detail_infos(detail_url=url)
        except _SCRAPE_ERRORS as exc:
            print('詳細網頁{}爬取失敗,已跳過:{!r}'.format(url, exc))
            continue
        save_data([all_data])


if __name__ == "__main__":
    page = range(0, 100)
    print('爬蟲開始')
    # map() blocks until every page is processed, so leaving the with-block
    # only tears down idle workers.
    with Pool(processes=4) as pool:
        pool.map(main, page)
#結構截圖
#對數據進行處理
import csv

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

# The sheet is read with header=None, so column labels are assigned by hand
# right after loading.
data = pd.read_excel(
    r"F:\0_個人學習\beike_find_house.xlsx",
    header=None,
)
data.columns = [
    '區/縣', '區域', '小區', '總價', '單價',
    '房屋戶型', '樓層', '總面積', '戶型結構', '套內面積',
    '建築類型', '朝向', '建築結構', '裝修情況', '梯戶比例',
    '供暖方式', '配備電梯', '產權年限', 's', '交易權屬',
    'u', '形式', '是否滿五', '產權形式', '是否有房本',
    '小區均價', '小區建成', 'style', '總棟數',
]
# Preview the first rows (only displayed in an interactive session).
data.head()
#結果截圖
#數據清洗
# Column clean-up.
#
# Some rows have their values in the wrong columns. Each step below finds
# such rows with a substring probe on one column and copies the values across
# into the intended columns. The order of the statements matters: every step
# reads the columns as left by the previous one.


def _has(frame, column, needle):
    """Return a boolean Series, True where ``needle`` is in ``str(value)``."""
    return frame[column].map(lambda value: needle in str(value)).astype(bool)


def _move(frame, rows, target, source):
    """On the rows selected by ``rows``, overwrite ``target`` with ``source``."""
    frame[target] = frame[source].where(rows, frame[target])


def _fill(frame, rows, target, value):
    """On the rows selected by ``rows``, overwrite ``target`` with ``value``."""
    frame[target] = frame[target].mask(rows, value)


# 1) '戶型結構' holds '南北': shift the affected columns over and set the
#    inner-area cell to the '㎡' placeholder.
rows = _has(data, '戶型結構', '南北')
_move(data, rows, '裝修情況', '建築類型')
_move(data, rows, '建築結構', '套內面積')
_move(data, rows, '朝向', '戶型結構')
_fill(data, rows, '套內面積', '㎡')

# 2) '戶型結構' holds '㎡'.
rows = _has(data, '戶型結構', '㎡')
_move(data, rows, '裝修情況', '朝向')
_move(data, rows, '建築結構', '建築類型')
_move(data, rows, '朝向', '套內面積')
_fill(data, rows, '套內面積', '㎡')

# 3) Replace '暫無數據' inner-area cells with the '㎡' placeholder so the next
#    step does not treat them as misplaced.
_fill(data, _has(data, '套內面積', '暫無數據'), '套內面積', '㎡')

# 4) '套內面積' holds no '㎡': shift four columns by one and mark the inner
#    area as '無信息'.
rows = ~_has(data, '套內面積', '㎡')
_move(data, rows, '裝修情況', '建築結構')
_move(data, rows, '建築結構', '朝向')
_move(data, rows, '朝向', '建築類型')
_move(data, rows, '建築類型', '套內面積')
_fill(data, rows, '套內面積', '無信息')

# 5) '裝修情況' holds '戶': shift three columns by one. The probe column is
#    the first one overwritten, so the row selection is computed once up
#    front. Re-testing after the first move would never match again, and the
#    second and third moves would silently do nothing.
rows = _has(data, '裝修情況', '戶')
_move(data, rows, '裝修情況', '建築結構')
_move(data, rows, '建築結構', '朝向')
_move(data, rows, '朝向', '建築類型')

# 6) '朝向' holds '結構'.
rows = _has(data, '朝向', '結構')
_move(data, rows, '建築結構', '朝向')
_move(data, rows, '朝向', '建築類型')

# Derived and normalised columns. Columns are addressed by label rather than
# by integer position, so the result does not depend on column order and does
# not rely on pandas treating integer keys as positions.
# '總樓層' must be derived before '樓層' is truncated to its first 3 chars.
data['總樓層'] = data['樓層'].map(
    lambda value: str(value)[3:].strip('(共').strip('層)'))
data['樓層'] = data['樓層'].map(lambda value: str(value)[:3])
data['總面積'] = data['總面積'].map(lambda value: str(value).strip('㎡'))
data['小區均價'] = data['小區均價'].map(
    lambda value: str(value).strip('元/㎡\n').strip('\n'))
data['小區建成'] = data['小區建成'].map(lambda value: str(value)[:4])
data['總棟數'] = data['總棟數'].map(lambda value: str(value)[:-1])
# Save the cleaned table, then keep only the columns used by the analysis.
data.to_csv('after_deal_data.csv', encoding='utf_8_sig')
need_data = data.loc[:, [
    '區/縣', '區域', '小區', '總價', '單價',
    '房屋戶型', '樓層', '總面積', '朝向', '建築結構',
    '裝修情況', '交易權屬', '形式', '是否滿五', '產權形式',
    '是否有房本', '小區均價', '小區建成', '總棟數',
]]
# Preview the first rows (only displayed in an interactive session).
need_data.head()
#結果截圖
# Print the column dtypes and non-null counts of the analysis table.
need_data.info()
# Summary statistics; the result is only displayed in an interactive session.
need_data.describe()
# Use the SimHei font for Chinese labels, and keep the minus sign on the axes
# rendering correctly with that font.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})
fig, ax = plt.subplots()

# Number of listings per district/county.
need_data['區/縣'].value_counts().plot.bar(
    color=['green', 'red', 'blue', 'grey', 'pink'],
    alpha=0.5,
)
plt.title('北京二手房各區、縣房源分布信息!', fontsize=15)
plt.xlabel('區、縣名稱', fontsize=15)
plt.ylabel('房源數量', fontsize=15)
plt.grid(linestyle=":", color="r")
plt.xticks(rotation=60)
plt.legend()
plt.show()
#結果截圖
# Mean unit price per district/county, lowest first.
# The price column is selected before averaging: averaging the whole frame
# would also try to average the text columns, which raises TypeError on
# pandas >= 2.0.
need_data.groupby('區/縣')['單價'].mean().sort_values(ascending=True).plot(
    kind='barh', color=['r', 'g', 'y', 'b'], alpha=0.5)
plt.title('北京二手房各區、縣房屋均價分布信息!', fontsize=15)
plt.xlabel('房屋均價', fontsize=15)
plt.ylabel('區、縣名稱', fontsize=15)
plt.grid(linestyle=":", color="r")
plt.legend()
plt.show()
#結果截圖
# Listing counts per room layout; the ten most common layouts are charted.
room_style = need_data['房屋戶型'].value_counts()
print(room_style)
need_data['房屋戶型'].value_counts().head(10).plot.bar(color='grey')
plt.title('北京二手房房屋戶型情況', fontsize=15, color='red')
plt.xlabel('房屋戶型', fontsize=15)
plt.ylabel('房源數量', fontsize=15)
plt.grid(linestyle=":", color="r")
plt.legend()
plt.xticks(rotation=60)
plt.show()

# Look at the listings that have the layout '1室0廳2衛'.
need_data[need_data['房屋戶型'] == '1室0廳2衛']
# Lowest and highest total price, with the listings that carry them.
total_price_min = need_data['總價'].min()
total_price_min_room_info = need_data[need_data['總價'] == total_price_min]
print('二手房總價最低價位為:\n{}'.format(total_price_min))
print('二手房總價最低的房源信息為:\n{}'.format(total_price_min_room_info))

total_price_max = need_data['總價'].max()
total_price_max_room_info = need_data[need_data['總價'] == total_price_max]
print('二手房總價最高價位為:\n{}'.format(total_price_max))
# This message used to say "最低" (lowest) although it prints the
# highest-priced listings.
print('二手房總價最高的房源信息為:\n{}'.format(total_price_max_room_info))
# Scatter plot of total floor area against total price.
home_area = need_data['總面積'].astype(float)
total_price = need_data['總價']
plt.scatter(home_area, total_price, s=3)
# The title used to be the room-layout chart's title, copied by mistake.
plt.title('北京二手房房屋面積與總價散點圖', fontsize=15)
plt.xlabel('房屋面積', fontsize=15)
plt.ylabel('房源總價', fontsize=15)
plt.grid(linestyle=":", color="r")
plt.show()

# Inspect the listing(s) with the largest floor area.
area_max = home_area.max()
area_max_room_info = need_data[home_area == area_max]
print('二手房面積最大的房源信息為:\n{}'.format(area_max_room_info))
# Number of listings per decoration type.
need_data['裝修情況'].value_counts().plot(
    kind='bar', color=['g', 'r', 'y', 'b'], alpha=0.5)
plt.title('北京二手房裝修情況的房源分布信息!', fontsize=15)
plt.xlabel('裝修類型', fontsize=15)
# The bars are listing counts; the label used to say "房屋均價" (mean price).
plt.ylabel('房源數量', fontsize=15)
plt.grid(linestyle=":", color="r")
plt.legend()
plt.xticks(rotation=0)
plt.show()
# Mean unit price per decoration type.
# The price column is selected before averaging: averaging the whole frame
# would also try to average the text columns, which raises TypeError on
# pandas >= 2.0.
need_data.groupby('裝修情況')['單價'].mean().plot(
    kind='bar', color=['g', 'r', 'y', 'b'], alpha=0.5)
plt.title('北京二手房裝修與房屋均價分布信息!', fontsize=15)
plt.xlabel('裝修類型', fontsize=15)
plt.ylabel('房屋均價', fontsize=15)
plt.grid(linestyle=":", color="r")
plt.legend()
plt.xticks(rotation=0)
plt.show()
# Clean the community average price. Where it contains '暫無數據' (no data),
# the listing's own unit price is used in its place.
def _community_price(row):
    """Return the row's community price, or its unit price when unavailable."""
    if '暫無數據' in str(row['小區均價']):
        return row['單價']
    return row['小區均價']


# Work on a copy before assigning a column.
need_data = need_data.copy()
need_data['小區均價'] = need_data.apply(_community_price, axis=1)
avg_price = need_data['小區均價'].astype('float')
print('小區均價最高的價格是:{}'.format(avg_price.max()))
print('小區均價最低的價格是:{}'.format(avg_price.min()))

# Listings with the lowest community average price.
need_data[avg_price == avg_price.min()]

# Listings with the highest community average price.
need_data[avg_price == avg_price.max()]
# Listings whose community construction year is '暫無數據' (no data).
need_data[need_data['小區建成'] == '暫無數據']

# Drop those listings by value instead of by hard-coded row labels, so the
# step works on any scrape and needs no blanket exception handler.
missing_built_year = need_data['小區建成'] == '暫無數據'
if missing_built_year.any():
    need_data = need_data[~missing_built_year]
else:
    print('數據已經剔除!!!')
need_data[need_data['小區建成'] == '暫無數據']

# Parse the construction time as a date and keep only the year.
built_year = pd.to_datetime(need_data['小區建成']).dt.year

# Scatter plot of construction year against community average price.
plt.scatter(built_year, need_data['小區均價'].astype(float), s=6)
plt.title('北京二手房小區建成年份與均價分布信息!', fontsize=15)
plt.xlabel('小區建成年份', fontsize=15)
plt.ylabel('房屋均價', fontsize=15)
plt.grid(linestyle=":", color="r")
plt.xticks(rotation=0)
plt.show()
# 分析房屋的產權形式(得出結論有兩種) need_data['產權形式'].value_counts()
# Listing counts per floor band, and the listings whose floor band is '未知('.
need_data['樓層'].value_counts()
need_data[need_data['樓層'] == '未知(']

# Exclude unknown-floor listings by value instead of by a hard-coded row
# label. need_louceng_data is assigned on both branches; it used to stay
# undefined whenever the drop failed.
unknown_floor = need_data['樓層'] == '未知('
if unknown_floor.any():
    need_louceng_data = need_data[~unknown_floor]
else:
    print('樓層未知的已刪除!')
    need_louceng_data = need_data
need_louceng_data[need_louceng_data['樓層'] == '未知(']

# Pie chart of listing share per floor band.
plt.figure(figsize=(7, 7))
need_louceng_data['樓層'].value_counts().plot(
    kind='pie', autopct='%1.1f%%', shadow=False, startangle=150)
plt.title('北京二手房樓層房源數量分布圖', fontsize=15)
plt.xticks(rotation=30)
plt.grid(linestyle=":", color="g")
plt.show()
# Mean unit price per floor band.
# The price column is selected before averaging: averaging the whole frame
# would also try to average the text columns, which raises TypeError on
# pandas >= 2.0.
avg_price_louceng = need_louceng_data.groupby('樓層')['單價'].mean()
avg_price_louceng.plot(kind='bar', color=['g', 'r', 'y', 'b'], alpha=0.5)
plt.title('北京二手房樓層與房屋均價分布信息!', fontsize=15)
plt.xlabel('樓層信息', fontsize=15)
plt.ylabel('房屋均價', fontsize=15)
plt.grid(linestyle=":", color="g", alpha=0.4)
plt.legend()
plt.xticks(rotation=0)
plt.show()
五,總結
這次的主題爬蟲爬的是北京貝殼找房網的網站,相對來說進行的還是比較不順利的,該網站設置了反爬限制。在數據的可視化上,大部分都還好,想要的也達到了我的預期效果。現在很多網站是用JSON存儲數據或者用JS動態加載數據的,因此之后會多學習這些方面的知識。