本文重點
- 系統分析網頁性質
- 結構化的數據解析
- csv數據保存
- python 3.8
- pycharm 專業版 >>> 激活碼
# 模塊使用
- requests >>> pip install requests
- parsel >>> pip install parsel
- csv
import csv
import re

import parsel  # HTML parsing with CSS selectors; third-party: pip install parsel
import requests  # HTTP client; third-party: pip install requests

# Scraper script: fetch one listing index page, follow every listing link on
# it, extract the fields below from each detail page and append them as rows
# to a CSV file.
#
# NOTE(review): the regex, the replaced substrings and the CSV column names
# below are written with Traditional Chinese characters. Whether the live
# pages use the same characters cannot be told from this file -- confirm
# against the actual page text before relying on the extracted values.

# Listing index page to scrape.
url = 'https://bj.lianjia.com/ershoufang/pg1/'

# A request header is sent so that the Python client presents itself to the
# server as a browser; User-Agent carries the browser's basic information.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36'
}

# Seconds to wait for the server before giving up; without a timeout
# requests.get() can block indefinitely on a stalled connection.
REQUEST_TIMEOUT = 10

response = requests.get(url=url, headers=headers, timeout=REQUEST_TIMEOUT)
print(response.text)

# Turn the response text into a Selector object so it can be queried with CSS.
selector_1 = parsel.Selector(response.text)
href = selector_1.css('div.leftContent li div.title a::attr(href)').getall()

# The writer must exist before the loop that uses it. The file is opened in
# append mode and writeheader() is called unconditionally, so every run of the
# script appends a header row followed by that run's data rows.
with open('二手房數據.csv', mode='a', encoding='utf-8', newline='') as f:
    csv_writer = csv.DictWriter(f, fieldnames=[
        '標題',
        '市區',
        '小區',
        '戶型',
        '朝向',
        '樓層',
        '裝修情況',
        '電梯',
        '面積(㎡)',
        '價格(萬元)',
        '年份',
    ])
    csv_writer.writeheader()

    for link in href:
        html_data = requests.get(
            url=link, headers=headers, timeout=REQUEST_TIMEOUT
        ).text
        selector = parsel.Selector(html_data)

        # Fields that are post-processed with str methods use get(default='')
        # so that a missing node yields an empty value instead of raising
        # AttributeError on None and aborting the whole run.
        title = selector.css('.title h1::text').get()  # title
        area = selector.css('.areaName .info a:nth-child(1)::text').get()  # district
        community_name = selector.css('.communityName .info::text').get()  # community
        room = selector.css('.room .mainInfo::text').get()  # layout
        room_type = selector.css('.type .mainInfo::text').get()  # orientation

        # Floor: keep the part after the last '/', then pull the number out of
        # the "共<N>層" pattern. Empty string when the pattern is not found.
        floor_text = selector.css('.room .subInfo::text').get(default='').split('/')[-1]
        floor_numbers = re.findall(r'共(\d+)層', floor_text)
        height = floor_numbers[0] if floor_numbers else ''

        # Decoration: the part after the last '/'.
        sub_info = selector.css('.type .subInfo::text').get(default='').split('/')[-1]
        elevator = selector.css('.content li:nth-child(12)::text').get()  # elevator
        # Floor area with the unit sign stripped.
        house_area = selector.css('.content li:nth-child(3)::text').get(default='').replace('㎡', '')
        price = selector.css('.price .total::text').get()  # price (10k yuan)
        # Build year with the trailing "年建" removed.
        date = selector.css('.area .subInfo::text').get(default='').replace('年建', '')

        dit = {
            '標題': title,
            '市區': area,
            '小區': community_name,
            '戶型': room,
            '朝向': room_type,
            '樓層': height,
            '裝修情況': sub_info,
            '電梯': elevator,
            '面積(㎡)': house_area,
            '價格(萬元)': price,
            '年份': date,
        }
        csv_writer.writerow(dit)
        print(title, area, community_name, room, room_type, height, sub_info,
              elevator, house_area, price, date, sep='|')
# Dependencies for the analysis / charting cells (third-party: pandas, pyecharts).
import pandas as pd
from pyecharts.charts import Map
from pyecharts.charts import Bar
from pyecharts.charts import Line
from pyecharts.charts import Grid
from pyecharts.charts import Pie
from pyecharts.charts import Scatter
from pyecharts import options as opts
# Load the listing data into a DataFrame and preview the first rows.
# NOTE(review): this reads '鏈家.csv', while the scraper in this file writes
# to '二手房數據.csv' -- confirm which file is the intended input.
df = pd.read_csv('鏈家.csv', encoding='utf-8')
df.head()
# Map of listing counts per district.
# NOTE(review): `region` and `count` are not defined anywhere in the visible
# part of this file; presumably district names and their listing counts
# computed from `df` -- confirm against the omitted cell.
new = [x + '區' for x in region]  # append the '區' suffix to every region name
m = (
    Map()
    .add('', [list(z) for z in zip(new, count)], '北京')
    .set_global_opts(
        title_opts=opts.TitleOpts(title='北京市二手房各區分布'),
        visualmap_opts=opts.VisualMapOpts(max_=3000),
    )
)
m.render_notebook()
# Bar chart of `count` per region with a line of `price` overlaid on a second
# y-axis, laid out in a Grid.
# NOTE(review): `df_price`, `region` and `count` are not defined in the
# visible part of this file -- confirm against the omitted cells.
df_price.values.tolist()  # bare expression: only displays a value in a notebook
price = [round(x, 2) for x in df_price.values.tolist()]
bar = (
    Bar()
    .add_xaxis(region)
    .add_yaxis('數量', count, label_opts=opts.LabelOpts(is_show=True))
    # Second y-axis (index 1), used by the price line below.
    .extend_axis(
        yaxis=opts.AxisOpts(
            name="價格(萬元)",
            type_="value",
            min_=200,
            max_=900,
            interval=100,
            axislabel_opts=opts.LabelOpts(formatter="{value}"),
        )
    )
    .set_global_opts(
        title_opts=opts.TitleOpts(title='各城區二手房數量-平均價格柱狀圖'),
        tooltip_opts=opts.TooltipOpts(
            is_show=True, trigger="axis", axis_pointer_type="cross"
        ),
        xaxis_opts=opts.AxisOpts(
            type_="category",
            axispointer_opts=opts.AxisPointerOpts(is_show=True, type_="shadow"),
        ),
        yaxis_opts=opts.AxisOpts(
            name='數量',
            axistick_opts=opts.AxisTickOpts(is_show=True),
            splitline_opts=opts.SplitLineOpts(is_show=False),
        ),
    )
)
line2 = (
    Line()
    .add_xaxis(xaxis_data=region)
    .add_yaxis(
        series_name="價格",
        yaxis_index=1,  # plot against the axis added by extend_axis above
        y_axis=price,
        label_opts=opts.LabelOpts(is_show=True),
        z=10,
    )
)
bar.overlap(line2)
grid = Grid()
grid.add(bar, opts.GridOpts(pos_left="5%", pos_right="20%"), is_control_axis_index=True)
grid.render_notebook()
# Bar chart of the '價格(萬元)' column against the '小區' column of `top_price`.
# NOTE(review): `top_price` is not defined in the visible part of this file.
# NOTE(review): the series and the x-axis are both labelled '數量' although the
# plotted values come from the price column -- confirm the intended labels.
area0 = top_price['小區'].values.tolist()
count = top_price['價格(萬元)'].values.tolist()
bar = (
    Bar()
    .add_xaxis(area0)
    .add_yaxis('數量', count, category_gap='50%')
    .set_global_opts(
        yaxis_opts=opts.AxisOpts(name='價格(萬元)'),
        xaxis_opts=opts.AxisOpts(name='數量'),
    )
)
bar.render_notebook()
# Scatter plot of price against floor area, with a numeric ('value') x-axis.
# NOTE(review): the area column is addressed here as '面積(㎡)' with ASCII
# parentheses, while the scraper in this file writes the column as '面積(㎡)'
# with full-width parentheses -- confirm the header of the CSV actually loaded.
s = (
    Scatter()
    .add_xaxis(df['面積(㎡)'].values.tolist())
    .add_yaxis('', df['價格(萬元)'].values.tolist())
    .set_global_opts(xaxis_opts=opts.AxisOpts(type_='value'))
)
s.render_notebook()
# Ring-shaped pie chart built from the index/values of `df_direction`.
# NOTE(review): `df_direction` is not defined in the visible part of this file.
directions = df_direction.index.tolist()
count = df_direction.values.tolist()
c1 = (
    Pie(
        init_opts=opts.InitOpts(
            width='800px',
            height='600px',
        )
    )
    .add(
        '',
        [list(z) for z in zip(directions, count)],
        radius=['20%', '60%'],
        center=['40%', '50%'],
        # rosetype="radius" was disabled in the original cell.
        label_opts=opts.LabelOpts(is_show=True),
    )
    .set_global_opts(
        title_opts=opts.TitleOpts(title='房屋朝向占比', pos_left='33%', pos_top="5%"),
        legend_opts=opts.LegendOpts(
            type_="scroll", pos_left="80%", pos_top="25%", orient="vertical"
        ),
    )
    # NOTE(review): position="outside" is passed to set_series_opts itself, not
    # to LabelOpts -- confirm this is where it was meant to go.
    .set_series_opts(
        label_opts=opts.LabelOpts(formatter='{b}:{c} ({d}%)'),
        position="outside",
    )
)
c1.render_notebook()
# Combined chart: a horizontal bar chart built from `df_fitment` with a rose
# pie chart overlaid on it.
# NOTE(review): `df_fitment` and `df_direction` are not defined in the visible
# part of this file.
# NOTE(review): the pie is titled '有/無電梯' but its data comes from
# `df_direction` -- confirm whether a different frame was intended.
fitment = df_fitment.index.tolist()
count1 = df_fitment.values.tolist()
directions = df_direction.index.tolist()
count2 = df_direction.values.tolist()
bar = (
    Bar()
    .add_xaxis(fitment)
    .add_yaxis('', count1, category_gap='50%')
    .reversal_axis()  # swap axes so the bars run horizontally
    .set_series_opts(label_opts=opts.LabelOpts(position='right'))
    .set_global_opts(
        xaxis_opts=opts.AxisOpts(name='數量'),
        title_opts=opts.TitleOpts(
            title='裝修情況/有無電梯玫瑰圖(組合圖)', pos_left='33%', pos_top="5%"
        ),
        legend_opts=opts.LegendOpts(
            type_="scroll", pos_left="90%", pos_top="58%", orient="vertical"
        ),
    )
)
c2 = (
    Pie(
        init_opts=opts.InitOpts(
            width='800px',
            height='600px',
        )
    )
    .add(
        '',
        [list(z) for z in zip(directions, count2)],
        radius=['10%', '30%'],
        center=['75%', '65%'],
        rosetype="radius",
        label_opts=opts.LabelOpts(is_show=True),
    )
    .set_global_opts(
        title_opts=opts.TitleOpts(title='有/無電梯', pos_left='33%', pos_top="5%"),
        legend_opts=opts.LegendOpts(
            type_="scroll", pos_left="90%", pos_top="15%", orient="vertical"
        ),
    )
    # NOTE(review): position="outside" is passed to set_series_opts itself, not
    # to LabelOpts -- confirm this is where it was meant to go.
    .set_series_opts(
        label_opts=opts.LabelOpts(formatter='{b}:{c} \n ({d}%)'),
        position="outside",
    )
)
bar.overlap(c2)
bar.render_notebook()

# Bar chart built from the index/values of `df_floor`, with a slider data-zoom
# control.
# NOTE(review): `df_floor` is not defined in the visible part of this file.
floor = df_floor.index.tolist()
count = df_floor.values.tolist()
bar = (
    Bar()
    .add_xaxis(floor)
    .add_yaxis('數量', count)
    .set_global_opts(
        title_opts=opts.TitleOpts(title='二手房樓層分布柱狀縮放圖'),
        yaxis_opts=opts.AxisOpts(name='數量'),
        xaxis_opts=opts.AxisOpts(name='樓層'),
        datazoom_opts=opts.DataZoomOpts(type_='slider'),
    )
)
bar.render_notebook()
# Horizontal bar chart built from the index/values of `df_area`.
# NOTE(review): `df_area` is not defined in the visible part of this file.
area = df_area.index.tolist()
count = df_area.values.tolist()
bar = (
    Bar()
    .add_xaxis(area)
    .add_yaxis('數量', count)
    .reversal_axis()  # swap axes so the bars run horizontally
    .set_series_opts(label_opts=opts.LabelOpts(position="right"))
    .set_global_opts(
        title_opts=opts.TitleOpts(title='房屋面積分布縱向柱狀圖'),
        yaxis_opts=opts.AxisOpts(name='面積(㎡)'),
        xaxis_opts=opts.AxisOpts(name='數量'),
    )
)
bar.render_notebook()