基於python廈門思明區二手房價分析和預測

本文轉載自查看原文 2021-06-25 03:25 152

　　　　　　　　　　　　基於python廈門思明區二手房價分析和構建基於機器學習的房價預測模型

一，選題背景

　　網上有條段子，某地房價5w每平，月收入剛好過萬，掐指一算，命中注定買房是不可能的，這輩子都不可能買房，所以要定個小目標：“我真的還想再活500年······”。當然，房子雖貴，但是我可以學學科學的方法了解房價趨勢，做到心中有數，萬一買得起呢？

二，設計方案

1，爬蟲名稱：基於python廈門思明區二手房價分析和預測

2，爬蟲爬取的內容與數據特征分析：內容主要是：房屋信息；數據特征：房屋信息的歸一化與存儲

3，設計方案概述：先爬取房屋信息並進行歸一化與存儲，再做數據預處理與數據可視化，最後以多個特征訓練線性回歸模型，得到基於機器學習的房價預測模型

三，結果特征分析

1，利用網絡爬蟲獲取廈門市思明區二手房信息

四，爬蟲程序設計

1，數據爬取與采集

# 使用 HTML.parser 解析器

import requests
from bs4 import BeautifulSoup

# Download the listing index page for second-hand homes in Siming district.
# The timeout keeps the script from hanging forever on a stalled connection.
res = requests.get(r'https://xm.esf.fang.com/house-a0352/', timeout=10)
res.raise_for_status()  # stop on an HTTP error instead of parsing an error page

# Parse the returned HTML with the 'html.parser' backend.
soup = BeautifulSoup(res.text, 'html.parser')

 1 def get_house(url):
 2     '獲取頁面中每個房子的信息'
 3     information = {} # 存儲房屋所有信息
 4     res = requests.get(url)
 5     soup = BeautifulSoup(res.text,'html.parser')
 6     
 7     # 獲取戶型、建築面積、單價、朝向、樓層、裝修情況
 8     houses = soup.select('.tab-cont-right .trl-item1') 
 9     #print(houses)
10     for house in houses:
11         me = house.text.strip().split('\n')
12         information[me[1]] = me[0].strip()
13     
14     # 獲取小區名字
15     name = soup.select('.rcont .blue')
16     information['小區名稱'] = name[0].text
17     
18     # 獲取房屋總價
19     price = soup.select('.trl-item')
20     information['房屋總價'] = price[0].text
21     print(information)
22 
23 # 函數測試
24 get_house("https://xm.esf.fang.com/chushou/3_346676334.htm?channel=1,2&psid=1_15_60") # 每一頁中房子的url(注意：鏈接可能失效)

 1 def get_page(n):
 2     '分頁爬取數據'
 3     for i in range(1,n+1): # n：爬取頁數
 4         url = r'https://xm.esf.fang.com/house-a0352/i3{}/'.format(i) # 總共多少頁
 5         res = requests.get(url)
 6         houses = BeautifulSoup(res.text,'html.parser')
 7         print(url)
 8         j  = 1
 9         houses = houses.select('.shop_list .clearfix h4 a') # 獲取每套房子的href
10         for house in houses:
11             try:
12                 demo_url = house['href']
13                 url = r'https://xm.esf.fang.com' + demo_url + '?channel=1,2&psid=1_{}_60'.format(j) # 每一頁中有多少套房子
14                 # 獲取當前頁面中每套房子的信息
15                 #get_house(url) 
16                 j += 1
17             except Exception as e:
18                 print('-------->',e)
19 get_page(1)

房屋信息的歸一化與存儲

 1 import requests
 2 from bs4 import BeautifulSoup
 3 
 4 def get_house(url):
 5     '獲取頁面中每個房子的信息'
 6     information = {} # 存儲房屋所有信息
 7     res = requests.get(url)
 8     soup = BeautifulSoup(res.text,'html.parser')
 9     
10     # 獲取戶型、建築面積、單價、朝向、樓層、裝修情況
11     houses = soup.select('.tab-cont-right .trl-item1') 
12     for house in houses:
13         m = house.text.strip().split('\n')
14         me = m[1]
15         if '朝向' in me:
16             me = me.strip('進門') 
17         if '樓層'in me:
18             me = me[0:2]
19         if '地上層數' in me:
20             me = '樓層'
21         if '裝修程度' in me:
22             me = '裝修'
23         information[me] = m[0].strip()
24     
25     # 獲取小區名字
26     name = soup.select('.rcont .blue')
27     information['小區名稱'] = name[0].text
28     
29     # 獲取房屋總價
30     price = soup.select('.trl-item')
31     information['房屋總價'] = price[0].text
32     return information
33 
34 # 函數測試
35 get_house("https://xm.esf.fang.com/chushou/3_346676334.htm?channel=1,2&psid=1_15_60") # 每一頁中房子的url

 1 import pandas as pd
 2 import time
 3 
 4 def get_page(i):
 5     '分頁爬取數據'
 6     url = r'https://xm.esf.fang.com/house-a0352/i3{}/'.format(i) # 總共多少頁
 7     res = requests.get(url)
 8     houses = BeautifulSoup(res.text,'html.parser')
 9     #print(url)
10     j  = 1
11     houses = houses.select('.shop_list .clearfix h4 a')
12     page_information = [] # 數據存儲
13     for house in houses:
14         try:
15             demo_url = house['href']
16             url = r'https://xm.esf.fang.com' + demo_url + '?channel=1,2&psid=1_{}_60'.format(j) # 每一頁中有多少套房子
17             # 獲取當前頁中每套房子的信息
18             information = get_house(url)
19             print('正在爬取第{}頁第{}套房子···'.format(i,j),end='\r')
20             page_information.append(information)
21             j += 1
22             time.sleep(0.5) # 預防爬取頻繁，防止ip被封
23         except Exception as e:
24             print('-------->',e)
25     #將爬取的數據轉換為DataFrame格式
26     df = pd.DataFrame(page_information)
27     #df.to_csv('house_information.csv')
28     return df
29 #get_page(1)

 1 # 正式爬取數據並保存為csv數據
 2 
 3 df = pd.DataFrame() # 創建一個空的DataFrame
 4 name_csv = 'house_information_'
 5 for i in range(1,101): # 總共爬取100頁數據
 6     try:
 7         df_get = get_page(i)
 8         df = df.append(df_get)
 9         print(df)
10     except Exception as e:
11         print('------->',e)
12     if i/100 == 1:
13         df.to_csv(name_csv+str(i)+'.csv')
14         df = pd.DataFrame() # 清空當前爬取完成的數據，防止內存溢出

2、數據預處理

import numpy as np
import pandas as pd

# Text columns that hold a number followed by a unit suffix.
_UNIT_SUFFIXES = {'建築面積': '平米', '單價': '元/平米', '房屋總價': '萬'}

# Columns removed after one-hot encoding: the raw text columns, the
# ambiguous/unknown categories, and one reference level per group
# ('中層', '中裝修') to avoid perfect multicollinearity in linear regression.
_DROPPED_COLUMNS = ['小區名稱', '朝向', '樓層', '裝修',
                    '東西', '南北', '暫無', '中層', '中裝修']


def clean_house_data(raw):
    """Turn the scraped listing table into a numeric modelling table.

    Steps: drop the saved index column, split the layout text into room /
    hall / bathroom counts, strip unit suffixes and convert to float,
    one-hot encode orientation / floor / decoration, drop unused columns,
    and finally drop rows with missing values.

    Args:
        raw: DataFrame as read from the crawler's CSV file.

    Returns:
        A new, fully numeric DataFrame; ``raw`` is not modified.
    """
    # The index column is 'index' or 'Unnamed: 0' depending on how the CSV
    # was written; neither is a feature.
    data = raw.drop(columns=['index', 'Unnamed: 0'], errors='ignore')

    # Layout text -> three numeric columns. Rows that do not match the
    # pattern become NaN and are removed by the final dropna().
    rooms = data['戶型'].str.extract(r'(\d+)室(\d+)廳(\d+)衛').astype(float)
    rooms.columns = ['室', '廳', '衛']

    # .str.replace tolerates missing values, unlike map(lambda e: e.replace()).
    for column, unit in _UNIT_SUFFIXES.items():
        data[column] = data[column].str.replace(unit, '', regex=False).astype(float)

    # dtype=float keeps the dummies numeric; newer pandas defaults to bool,
    # which statsmodels rejects when mixed with float columns.
    dummies = [pd.get_dummies(data[column], dtype=float)
               for column in ('朝向', '樓層', '裝修')]

    data = pd.concat([data.drop(columns='戶型'), rooms] + dummies, axis=1)
    # errors='ignore': a category absent from this crawl simply has no column.
    # A label that occurs in more than one dummy group (e.g. '暫無') is
    # removed everywhere.
    data = data.drop(columns=_DROPPED_COLUMNS, errors='ignore')
    return data.dropna()


if __name__ == '__main__':
    # If pandas cannot open the file, re-save it as UTF-8 first.
    data = clean_house_data(pd.read_csv(r'house_information.csv'))
    print(data.columns)
    print(data.head())
    data.info()

3、數據可視化

import matplotlib.pyplot as plt

# Raw scatter of floor area against total price. The outliers visible here
# hurt a linear fit, so they are filtered out below.
plt.scatter(data['建築面積'], data['房屋總價'])
plt.xlabel("area")
plt.ylabel("price")
plt.show()

# Keep ordinary homes only: floor area of at most 300 square metres.
df = data[data['建築面積'] <= 300]
area = df['建築面積']
price = df['房屋總價']
# print(area.count())  # number of rows left after filtering
plt.scatter(area, price)
plt.xlabel("area")
plt.ylabel("price")
plt.show()

4、構建房價預測模型

# Single-feature linear regression: total price as a function of floor area.
from sklearn.linear_model import LinearRegression

linear = LinearRegression()
# scikit-learn expects a 2-D feature matrix, so turn the column into (n, 1).
area = np.array(area).reshape(-1, 1)
price = np.array(price).reshape(-1, 1)
model = linear.fit(area, price)
# Intercept and slope of the fitted line.
print(model.intercept_, model.coef_)

# Draw the fitted line over the data.
linear_p = model.predict(area)
plt.figure(figsize=(12, 6))
plt.scatter(area, price)
plt.plot(area, linear_p, 'red')
plt.xlabel("area")
plt.ylabel("price")
plt.show()

多特征模型訓練

# Candidate features for the multi-feature model.
cols = ['建築面積', '室', '廳', '衛', '東', '東北', '東南', '北', '南', '西',
        '西北', '西南', '低層', '高層', '毛坯', '簡裝修', '精裝修', '豪華裝修']

X = df[cols]
X.head()

y = df['房屋總價']
y.head()

print(type(X))
print(type(y))

# Hold out 20% of the rows as a test set. This is a single train/test split,
# not cross-validation.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=12)
print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)

# Train on the training part only.
linear = LinearRegression()
model = linear.fit(x_train, y_train)
print(model.intercept_, model.coef_)

# Evaluate on the held-out rows; score() returns R^2.
price_end = model.predict(x_test)
score = model.score(x_test, y_test)
print("模型得分：", score)  # the author treats a score above 0.6 as decent

使用多種特征組合都可以預測房價，那么怎么找出最佳組合，這里使用假設驗證法，選出最佳特征組合

 1 # 使用假設驗證法，選出最佳特征組合
 2 cols = ['建築面積','室', '廳', '衛', '東', '東北', '東南', '北', '南', '西',
 3        '西北', '西南', '低層', '高層', '毛坯', '簡裝修', '精裝修', '豪華裝修']
 4 import statsmodels.api as sm
 5 Y = df['房屋總價']
 6 X = df[cols]
 7 X_ = sm.add_constant(X) #增加一列值為1的const列，保證偏置項的正常
 8 #print(X_)
 9 # 使用最小平方法
10 result = sm.OLS(Y,X_)
11 # 使用fit方法進行計算
12 summary = result.fit()
13 # 調用summary2方法打印出假設驗證信息（性能指標）
14 summary.summary2() # R-squared:模型評分 AIC：組合完越小越好

import itertools

# An exhaustive subset search grows as 2**n - 1: 16 candidates already give
# 65,535 combinations. The author reports failures beyond 16 features, so
# keep the candidate list at or below that size.
list1 = list(range(1, 17))
# list2[k - 1] holds every k-element combination of list1.
list2 = [list(itertools.combinations(list1, k))
         for k in range(1, len(list1) + 1)]

# 使用itertools，找出AIC最小值的特征組合作為模型訓練的特征
# 尋找最小AIC值的特征組合

import heapq
import itertools
from operator import itemgetter

fields = ['建築面積', '室', '廳', '衛', '東', '北', '南', '西',
          '低層', '高層', '毛坯', '簡裝修', '精裝修', '豪華裝修']

# Fit one OLS model per non-empty subset of `fields` (every combination is
# tried, nothing is sampled) and record its AIC; lower AIC is better.
aics = {}
for size in range(1, len(fields) + 1):
    for variables in itertools.combinations(fields, size):  # tuple of names
        # astype(float) guards against boolean dummy columns.
        design = sm.add_constant(df[list(variables)].astype(float))
        aics[variables] = sm.OLS(Y, design).fit().aic

# The ten subsets with the lowest AIC, best first.
heapq.nsmallest(10, aics.items(), key=itemgetter(1))

1 # 接下來使用AIC值最小的特征組合進行預測
2 col2 = ['建築面積', '室', '廳', '東', '南', '高層', '毛坯', '精裝修', '豪華裝修']
3 X = df[col2]
4 y = df['房屋總價']
5 x_train, x_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=13)
6 linear = LinearRegression()
7 model = linear.fit(x_train,y_train)
8 model.score(x_test,y_test) # 模型性能有所提高，但是提升的不明顯

5、房價的預測

現在我可以根據給定的最佳特征組合進行預測房價

# Example: a 120 square-metre home with 3 rooms and 1 hall, facing south,
# on a high floor, finely decorated.
my_house = {'建築面積': 120, '室': 3, '廳': 1, '東': 0, '南': 1,
            '高層': 1, '毛坯': 0, '精裝修': 1, '豪華裝修': 0}
# One-row frame with the training column names and order, so each value
# lines up with the right coefficient.
my_house = pd.DataFrame([my_house], columns=col2)
# print(x_test)
model.predict(my_house)  # predicted total price

完整代碼：

  1 import requests
  2 from bs4 import BeautifulSoup
  3 # requests返回網頁內容
  4 res = requests.get(r'https://xm.esf.fang.com/house-a0352/')
  5 #res.text
  6 
  7 # BeautifulSoup解析網頁
  8 soup = BeautifulSoup(res.text,'html.parser') # 使用 HTML.parser 解析器
  9 
 10 
 11 
 12 def get_house(url):
 13     '獲取頁面中每個房子的信息'
 14     information = {} # 存儲房屋所有信息
 15     res = requests.get(url)
 16     soup = BeautifulSoup(res.text,'html.parser')
 17     
 18     # 獲取戶型、建築面積、單價、朝向、樓層、裝修情況
 19     houses = soup.select('.tab-cont-right .trl-item1') 
 20     #print(houses)
 21     for house in houses:
 22         me = house.text.strip().split('\n')
 23         information[me[1]] = me[0].strip()
 24     
 25     # 獲取小區名字
 26     name = soup.select('.rcont .blue')
 27     information['小區名稱'] = name[0].text
 28     
 29     # 獲取房屋總價
 30     price = soup.select('.trl-item')
 31     information['房屋總價'] = price[0].text
 32     print(information)
 33 
 34 # 函數測試
 35 get_house("https://xm.esf.fang.com/chushou/3_346676334.htm?channel=1,2&psid=1_15_60") # 每一頁中房子的url(注意：鏈接可能失效)
 36 
 37 
 38 
 39 def get_page(n):
 40     '分頁爬取數據'
 41     for i in range(1,n+1): # n：爬取頁數
 42         url = r'https://xm.esf.fang.com/house-a0352/i3{}/'.format(i) # 總共多少頁
 43         res = requests.get(url)
 44         houses = BeautifulSoup(res.text,'html.parser')
 45         print(url)
 46         j  = 1
 47         houses = houses.select('.shop_list .clearfix h4 a') # 獲取每套房子的href
 48         for house in houses:
 49             try:
 50                 demo_url = house['href']
 51                 url = r'https://xm.esf.fang.com' + demo_url + '?channel=1,2&psid=1_{}_60'.format(j) # 每一頁中有多少套房子
 52                 # 獲取當前頁面中每套房子的信息
 53                 #get_house(url) 
 54                 j += 1
 55             except Exception as e:
 56                 print('-------->',e)
 57 get_page(1)
 58 
 59 
 60 
 61 import requests
 62 from bs4 import BeautifulSoup
 63 
 64 def get_house(url):
 65     '獲取頁面中每個房子的信息'
 66     information = {} # 存儲房屋所有信息
 67     res = requests.get(url)
 68     soup = BeautifulSoup(res.text,'html.parser')
 69     
 70     # 獲取戶型、建築面積、單價、朝向、樓層、裝修情況
 71     houses = soup.select('.tab-cont-right .trl-item1') 
 72     for house in houses:
 73         m = house.text.strip().split('\n')
 74         me = m[1]
 75         if '朝向' in me:
 76             me = me.strip('進門') 
 77         if '樓層'in me:
 78             me = me[0:2]
 79         if '地上層數' in me:
 80             me = '樓層'
 81         if '裝修程度' in me:
 82             me = '裝修'
 83         information[me] = m[0].strip()
 84     
 85     # 獲取小區名字
 86     name = soup.select('.rcont .blue')
 87     information['小區名稱'] = name[0].text
 88     
 89     # 獲取房屋總價
 90     price = soup.select('.trl-item')
 91     information['房屋總價'] = price[0].text
 92     return information
 93 
 94 # 函數測試
 95 get_house("https://xm.esf.fang.com/chushou/3_346676334.htm?channel=1,2&psid=1_15_60") # 每一頁中房子的url
 96 
 97 
 98 
 99 import pandas as pd
100 import time
101 
102 def get_page(i):
103     '分頁爬取數據'
104     url = r'https://xm.esf.fang.com/house-a0352/i3{}/'.format(i) # 總共多少頁
105     res = requests.get(url)
106     houses = BeautifulSoup(res.text,'html.parser')
107     #print(url)
108     j  = 1
109     houses = houses.select('.shop_list .clearfix h4 a')
110     page_information = [] # 數據存儲
111     for house in houses:
112         try:
113             demo_url = house['href']
114             url = r'https://xm.esf.fang.com' + demo_url + '?channel=1,2&psid=1_{}_60'.format(j) # 每一頁中有多少套房子
115             # 獲取當前頁中每套房子的信息
116             information = get_house(url)
117             print('正在爬取第{}頁第{}套房子···'.format(i,j),end='\r')
118             page_information.append(information)
119             j += 1
120             time.sleep(0.5) # 預防爬取頻繁，防止ip被封
121         except Exception as e:
122             print('-------->',e)
123     #將爬取的數據轉換為DataFrame格式
124     df = pd.DataFrame(page_information)
125     #df.to_csv('house_information.csv')
126     return df
127 #get_page(1)
128 
129 
130 
131 
132 # 正式爬取數據並保存為csv數據
133 
134 df = pd.DataFrame() # 創建一個空的DataFrame
135 name_csv = 'house_information_'
136 for i in range(1,101): # 總共爬取100頁數據
137     try:
138         df_get = get_page(i)
139         df = df.append(df_get)
140         print(df)
141     except Exception as e:
142         print('------->',e)
143     if i/100 == 1:
144         df.to_csv(name_csv+str(i)+'.csv')
145         df = pd.DataFrame() # 清空當前爬取完成的數據，防止內存溢出
146 
147 
148 
149 import numpy as np
150 import pandas as pd
151 
152 data = pd.read_csv(r'house_information.csv') # 如果用pandas打不開數據，可以使用記事本打開把編碼格式改成utf-8另存
153 data.head()
154 
155 
156 data.drop('index',axis=1,inplace=True) # 刪除index列（用del更方便）
157 data.head()
158 
159 
160 # Series的extract支持正則匹配抽取，返回的值是字符串
161 data[['室','廳','衛']] = data['戶型'].str.extract(r'(\d+)室(\d+)廳(\d+)衛')
162 
163 
164 # 把字符串格式轉化為float，並刪除戶型
165 data['室'] = data['室'].astype(float)
166 data['廳'] = data['廳'].astype(float)
167 data['衛'] = data['衛'].astype(float)
168 del data['戶型']
169 data.head()
170 
171 
172 
173 # 將建築面積后的平方米去除，並將數據類型改成浮點型
174 data['建築面積'] = data['建築面積'].map(lambda e:e.replace('平米',''))# Series中的map
175 data['建築面積'] = data['建築面積'].astype(float)
176 data.head()
177 
178 
179 
180 # 將單價后的元/平米去除，並將數據類型改成浮點型
181 data['單價'] = data['單價'].map(lambda e:e.replace(r'元/平米',''))
182 data['單價'] = data['單價'].astype(float)
183 data.head()
184 
185 
186 
187 # 將房屋總價后的萬去除，並將數據類型改成浮點型
188 data['房屋總價'] = data['房屋總價'].map(lambda e:e.replace('萬',''))
189 data['房屋總價'] = data['房屋總價'].astype(float)
190 data.head()
191 
192 
193 
194 # 使用pd.get_dummies() 量化數據
195 data_direction = pd.get_dummies(data['朝向'])
196 data_direction.head()
197 
198 
199 
200 # 使用pd.get_dummies() 量化數據
201 data_floor = pd.get_dummies(data['樓層'])
202 data_floor.head()
203 
204 
205 
206 # 使用pd.get_dummies() 量化數據
207 data_decoration = pd.get_dummies(data['裝修'])
208 data_decoration.head()
209 
210 
211 
212 # 使用pd.concat矩陣拼接，axis=1：水平拼接
213 data = pd.concat([data,data_direction,data_floor,data_decoration],axis=1) 
214 
215 
216 
217 # 拼接后的列名
218 data.columns
219 
220 
221 # 特征帥選
222 del data['小區名稱']
223 del data['朝向']
224 del data['樓層']
225 del data['裝修']
226 del data['東西']
227 del data['南北']
228 del data['暫無'] # 兩列都刪除
229 del data['中層'] # 多重共線性問題（線性回歸）
230 del data['中裝修']
231 data.columns
232 
233 
234 
235 data.head()
236 
237 data.info() # 發現 室廳衛中 有缺失值
238 
239 
240 # 刪除缺失值
241 data.dropna(inplace=True)
242 data.info() 
243 
244 
245 import matplotlib.pyplot as plt
246 area = data['建築面積']
247 price = data['房屋總價']
248 plt.scatter(area,price)
249 plt.show() # 有離群點數據，對線性分析不利，需要過濾
250 
251 
252 
253 df = data[data['建築面積'] <=300] # 正常住宅面積小於等於300平米
254 area = df['建築面積']
255 price = df['房屋總價']
256 #print(area.count()) #過濾后的數據量
257 plt.scatter(area,price)
258 plt.xlabel("area")
259 plt.ylabel("price")
260 plt.show()
261 
262 
263 
264 
265 # 先根據建築面積和房屋總價訓練模型（一元線性回歸）
266 from sklearn.linear_model import LinearRegression
267 linear = LinearRegression()
268 area = np.array(area).reshape(-1,1) # 這里需要注意新版的sklearn需要將數據轉換為矩陣才能進行計算
269 price = np.array(price).reshape(-1,1)
270 # 訓練模型
271 model = linear.fit(area,price)
272 # 打印截距和回歸系數
273 print(model.intercept_, model.coef_)
274 
275 
276 
277 # 線性回歸可視化(數據擬合)
278 linear_p = model.predict(area)
279 plt.figure(figsize=(12,6))
280 plt.scatter(area,price)
281 plt.plot(area,linear_p,'red')
282 plt.xlabel("area")
283 plt.ylabel("price")
284 plt.show()
285 
286 
287 
288 
289 cols = ['建築面積','室', '廳', '衛', '東', '東北', '東南', '北', '南', '西',
290        '西北', '西南', '低層', '高層', '毛坯', '簡裝修', '精裝修', '豪華裝修']
291 
292 X = df[cols]
293 X.head()
294 
295 y = df['房屋總價']
296 y.head()
297 
298 print(type(X))
299 print(type(y))
300 # 使用train_test_split進行交叉驗證
301 from sklearn.model_selection import train_test_split
302 x_train, x_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=12)
303 print(x_train.shape,y_train.shape)
304 print(x_test.shape,y_test.shape)
305 
306 
307 
308 # 模型訓練
309 linear = LinearRegression()
310 model = linear.fit(x_train,y_train)
311 print(model.intercept_, model.coef_)
312 
313 
314 
315 # 模型性能評分
316 price_end = model.predict(x_test)
317 score = model.score(x_test,y_test) 
318 print("模型得分：",score)# 一般模型在0.6以上就表現的不錯
319 
320 
321 
322 # 使用假設驗證法，選出最佳特征組合
323 cols = ['建築面積','室', '廳', '衛', '東', '東北', '東南', '北', '南', '西',
324        '西北', '西南', '低層', '高層', '毛坯', '簡裝修', '精裝修', '豪華裝修']
325 import statsmodels.api as sm
326 Y = df['房屋總價']
327 X = df[cols]
328 X_ = sm.add_constant(X) #增加一列值為1的const列，保證偏置項的正常
329 #print(X_)
330 # 使用最小平方法
331 result = sm.OLS(Y,X_)
332 # 使用fit方法進行計算
333 summary = result.fit()
334 # 調用summary2方法打印出假設驗證信息（性能指標）
335 summary.summary2() # R-squared:模型評分 AIC：組合完越小越好
336 
337 
338 
339 
340 import itertools
341 
342 list1 = [1, 2,3, 4, 5,6,7,8,9,10,11,12,13,14,15,16] 
343 list2 = []
344 for i in range(1, len(list1)+1):
345     iter1 = itertools.combinations(list1, i)
346     list2.append(list(iter1))
347 #print(list2)
348 
349 
350 
351 
352 import itertools
353 fileds = ['建築面積','室', '廳', '衛', '東','北', '南', '西','低層', '高層', '毛坯', '簡裝修', '精裝修', '豪華裝修']
354 acis = {}
355 for i in range(1,len(fileds)+1):
356     for virables in itertools.combinations(fileds,i): #從fileds中隨機選擇i個特征機型組合，返回的virables為元組類型
357         x1 = sm.add_constant(df[list(virables)])
358         x2 = sm.OLS(Y,x1)
359         res = x2.fit()
360         acis[virables] = res.aic # AIC評分越小越好
361 
362 
363 
364 from collections import Counter
365 # 對字典進行統計
366 counter = Counter(acis)
367 # 降序選出AIC最小的10個數，也就是最佳特征組合
368 counter.most_common()[-10:] 
369 
370 
371 
372 
373 # 接下來使用AIC值最小的特征組合進行預測
374 col2 = ['建築面積', '室', '廳', '東', '南', '高層', '毛坯', '精裝修', '豪華裝修']
375 X = df[col2]
376 y = df['房屋總價']
377 x_train, x_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=13)
378 linear = LinearRegression()
379 model = linear.fit(x_train,y_train)
380 model.score(x_test,y_test) # 模型性能有所提高，但是提升的不明顯
381 
382 
383 
384 
385 # 假設我要買一套房子（想想就覺得很美），房子面積120平米，3室，1廳，南面，高層，精裝修
386 my_house = [120,3,1,0,1,1,0,1,0] #根據col2特征
387 my_house = np.array(my_house).reshape(-1,1).T
388 #print(x_test)
389 model.predict(my_house)# 預測價格

五，總結

　　經過對數據的分析與可視化，並以 AIC 比較各種特征組合（文中稱為假設驗證法）選出最佳特征組合、輸出相應的性能指標，預期目標已經達到；以多特征訓練得到的房價預測模型表現不錯。

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯系本站郵箱yoyou2525@163.com刪除。

猜您在找 Python數據分析——上海市二手房價格分析 Python某地區二手房房價數據分析 python鏈家二手房分析 Python爬取二手房源數據，可視化分析二手房市場行情數據南京二手房成交數據分析 Python爬蟲 —3000+條北京二手房數據可視化分析基於Python的南京二手房數據可視化分析 Python網絡爬蟲——二手房數據爬取及分析分析及可視化二手房信息 python 爬取鏈家二手房信息