一、前言:
安居客、鏈家和房天下是目前網上可以獲取小區數據較為精准的網站,之前已經發過鏈家和房天下的部分區域(僅浦東)獲取攻略。這次因為工作原因,需要獲取整個上海的所有小區數據(僅別墅和住宅),所以過年這幾天在不斷的數據分析、獲取、清洗和驗證。特此記錄一下,也把代碼和各位分享。
二、爬取思路:
不管是安居客、鏈家還是房天下,獲取數據的思路都是一致的:
1、獲取不同行政區的網址
2、獲取不同行政區下不同商圈/街鎮的網址
3、獲取不同行政區下每一個商圈/街鎮中所有小區的網址
4、根據3中獲得的網址,把需要的頁面元素爬下來
三、安居客、房天下和鏈家對比:
我把三個網站的數據都爬下來了,不過最後只用了安居客的數據
四、鏈家代碼

import requests
from bs4 import BeautifulSoup
import re
import time
import traceback
import math

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
    'Host': 'sh.lianjia.com',
    'Cookie': ''
}

def read_Lregion_dict():
    """Read the district-url file and return it as {url_path: district_name}."""
    with open('行政區url.txt', 'r') as f:
        large_region_list = f.readlines()
    large_region_dict = {}
    for ele in large_region_list:
        url, region = ele.split(' ')
        large_region_dict[url] = region.replace('\n', '')
    return large_region_dict

def get_jiezhen_urls():
    """Fetch street/town urls under every district and append them to 街鎮url.txt."""
    large_region_dict = read_Lregion_dict()
    small_region_dict = {}
    for k, v in large_region_dict.items():
        if v == '上海周邊':
            continue  # outside Shanghai proper, skip
        url = 'https://sh.lianjia.com' + k
        r = requests.get(url=url, headers=headers)
        soup = BeautifulSoup(r.text, 'lxml')
        container = soup.find(name='div', attrs={'data-role': 'ershoufang'})
        for ele in container.find_all(name='a'):
            href = ele.attrs['href']
            name = ele.string
            # The widget repeats district-level links; keep only street/town links.
            if name in large_region_dict.values():
                continue
            small_region_dict[href] = name
            with open('街鎮url.txt', 'a', encoding='utf-8') as file:
                file.write(','.join([v, name, href]))
                file.write('\n')
            print(v, name, href)

def region_total(url):
    """Return the number of communities listed under one street/town url path."""
    url = r"https://sh.lianjia.com" + url + '?from=rec'
    r = requests.get(url=url, headers=headers)
    soup = BeautifulSoup(r.text, 'lxml')
    total_find = soup.find(name='h2', attrs={'class': 'total fl'})
    return int(total_find.find(name='span').string.strip())

def get_all_urls():
    """Collect every community's name and url (30 listings/page) into 小區url.txt."""
    with open('街鎮url.txt', 'r', encoding='utf-8') as f:
        small_region_list = f.readlines()
    for ele in small_region_list:
        l_region, s_region, url = ele.split(',')
        url = url.replace('\n', '')
        total_num = region_total(url)
        pages = int(math.ceil(int(total_num) / 30))  # 30 listings per page
        for page in range(1, pages + 1):
            suffix = "" if page == 1 else 'pg' + str(page)  # page 1 has no 'pgN' suffix
            tmp_url = r"https://sh.lianjia.com" + url + suffix
            r = requests.get(url=tmp_url, headers=headers)
            soup = BeautifulSoup(r.text, 'lxml')
            for j in soup.find_all(name='div', attrs={'class': 'title'}):
                community = str(j)
                if '''target="_blank"''' in community:
                    m = re.search('''<a href="(.*?)" target="_blank">(.*?)</a>.*?''', community)
                    community_url = m.group(1)
                    community_name = m.group(2)
                    with open('小區url.txt', 'a', encoding='utf-8') as file:
                        file.write(','.join([l_region, s_region, community_name, community_url]))
                        file.write('\n')
            time.sleep(1)  # throttle requests
        print('{}, {}總共有{}個小區,共有{}頁,已全部url爬取完成!'.format(l_region, s_region, total_num, pages))

def get_communityInfo(l_region, s_region, community_name, community_url):
    """Scrape one community's detail page and append the record to the output file."""
    r = requests.get(url=community_url, headers=headers)
    soup = BeautifulSoup(r.text, 'lxml')
    try:
        unitPrice = soup.find(name='span', attrs={'class': 'xiaoquUnitPrice'}).string  # average price
    except Exception:  # was a bare except; keep best-effort fallback without masking SystemExit
        unitPrice = '空'
    try:
        address = soup.find(name='div', attrs={'class': 'detailDesc'}).string  # community address
        address = '"' + address + '"'  # quoted: the address text may contain commas
    except Exception:
        address = '空'
    try:
        xiaoquInfo = soup.find_all(name='span', attrs={'class': 'xiaoquInfoContent'})  # info fields
        xiaoquInfo_list = [l_region, s_region]
        community_name = '"' + community_name + '"'
        xiaoquInfo_list.append(community_name)
        xiaoquInfo_list.append(address)
        xiaoquInfo_list.append(unitPrice)
        for info in xiaoquInfo:
            xiaoquInfo_list.append(info.string)
        # Drops the last info field — presumably an unwanted trailing field; TODO confirm.
        xiaoquInfo_list.pop()
        export_communityInfo(xiaoquInfo_list)
        time.sleep(1)
        print('已爬取{},{}的{}信息'.format(l_region, s_region, community_name))
    except Exception:
        print('{},{}的{}爬取錯誤,url是{}'.format(l_region, s_region, community_name, community_url))

def export_communityInfo(xiaoquInfo_list):
    """Append one comma-joined community record to the output file."""
    with open('上海地區小區信息.txt', 'a', encoding='utf-8') as file:
        file.write(','.join(xiaoquInfo_list))
        file.write('\n')

if __name__ == "__main__":
    # get_jiezhen_urls()  # step 1: street/town urls
    # get_all_urls()      # step 2: all community urls
    # NOTE(review): get_all_urls() writes 小區url.txt but this step reads 小區url.csv —
    # confirm the txt file is converted/renamed before running this step.
    with open('小區url.csv', 'r') as f:
        xiaoqu_list = f.readlines()
    for ele in xiaoqu_list:
        l_region, s_region, community_name, community_url = ele.split(',')
        community_url = community_url.replace('\n', '')
        try:
            get_communityInfo(l_region, s_region, community_name, community_url)
        except Exception:
            traceback.print_exc()
            break
五、房天下代碼

import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import traceback

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
    'cookie': ''''''
}

def get_true_url(old_url):
    """Follow fang.com's interstitial redirect page and return the real url."""
    r = requests.get(url=old_url, headers=headers)
    if r'<title>跳轉...</title>' in r.text:
        soup = BeautifulSoup(r.text, 'lxml')
        return soup.find(name='a', attrs={'class': 'btn-redir'}).attrs['href']
    return old_url

def get_region_urls():
    """Collect street/town listing urls for each Shanghai district into 上海地區街鎮url.txt."""
    # District name -> fang.com numeric area code.
    sh_dict = {'浦東': '25', '嘉定': '29', '寶山': '30', '閔行': '18', '松江': '586', '普陀': '28',
               '靜安': '21', '黃浦': '24', '虹口': '23', '青浦': '31', '奉賢': '32', '金山': '35',
               '楊浦': '26', '徐匯': '19', '長寧': '20', '崇明': '996'}
    for l_region_name, l_region_url in sh_dict.items():
        url = r"https://sh.esf.fang.com/housing/" + l_region_url + '__0_3_0_0_1_0_0_0/'
        true_url = get_true_url(url)
        r = requests.get(url=true_url, headers=headers)
        soup = BeautifulSoup(r.text, 'lxml')
        container = soup.find(name='p', attrs={'id': 'shangQuancontain', 'class': 'contain'})
        for i in container.find_all(name='a'):
            if i.string != '不限':
                this_url = r"https://sh.esf.fang.com" + i.attrs['href']
                this_url_list = get_region_url(this_url)
                with open('上海地區街鎮url.txt', 'a', encoding='utf-8') as file:
                    for tmp_url in this_url_list:
                        file.write(','.join([l_region_name, i.string, tmp_url]))
                        file.write('\n')
        print('{}已完成'.format(l_region_name))

def get_region_url(old_url):
    """Return old_url plus every pagination url of the same region (no duplicates)."""
    true_url = get_true_url(old_url)
    r = requests.get(url=true_url, headers=headers)
    soup = BeautifulSoup(r.text, 'lxml')
    page_url = soup.find(name='div', attrs={'class': 'fanye gray6'})
    page_url_list = [old_url]
    for j in page_url.find_all(name='a'):
        if 'href' in j.attrs:
            temp_url = r'https://sh.esf.fang.com/' + j.attrs['href'][1:]
            if temp_url not in page_url_list:
                page_url_list.append(temp_url)
    return page_url_list

def get_xiaoqu_url(bigregion, smallregion, old_url):
    """Scrape one listing page: write each community's name/type/url to 上海地區小區url.txt."""
    true_url = get_true_url(old_url)
    r = requests.get(url=true_url, headers=headers)
    soup = BeautifulSoup(r.text, 'lxml')
    j = 0
    for i in soup.find_all(name='a', attrs={'class': 'plotTit', 'target': '_blank'}):
        xiaoqu_type = soup.find('a', text=i.string, attrs={'class': 'plotTit', 'target': '_blank'}).parent.find('span', attrs={'class': 'plotFangType'}).string
        xiaoqu_name = i.string
        xiaoqu_url = 'https://sh.esf.fang.com/' + i.attrs['href'][1:]
        xiaoqu_url = xiaoqu_url.replace('.htm', '/housedetail.htm')  # jump straight to the detail page
        print(bigregion, smallregion, xiaoqu_name, xiaoqu_type, xiaoqu_url)
        j += 1
        with open('上海地區小區url.txt', 'a', encoding='utf-8') as file:
            file.write(','.join([bigregion, smallregion, xiaoqu_name, xiaoqu_type, xiaoqu_url]))
            file.write('\n')
    time.sleep(1)  # throttle requests
    print(bigregion, smallregion, old_url, '所有小區url獲取完畢,共有{}條數據'.format(j))
    print('-' * 100)

def get_all_urls(last_url=None):
    """Walk every street/town url and collect community urls.

    last_url makes the run resumable: processing starts at the line whose url
    equals last_url. Results need de-duplication afterwards because some
    communities straddle two regions.
    """
    with open('上海地區街鎮url.txt', 'r', encoding='utf-8') as f:
        region_list = f.readlines()
    # Bug fix: with the default last_url=None the original never set
    # event_tracking to True, so the whole function silently did nothing.
    event_tracking = last_url is None
    for line in region_list:
        l_region, s_region, url = line.split(',')
        url = url.replace('\n', '')
        if last_url == url:
            event_tracking = True
        if event_tracking:
            print(l_region, s_region, url)
            get_xiaoqu_url(l_region, s_region, url)

def get_total_informations(l_region, s_region, community_name, community_type, community_url):
    """Scrape one community detail page; return a list of fields, or None on failure."""
    r = requests.get(url=community_url, headers=headers)
    soup = BeautifulSoup(r.text, 'lxml')
    informations = soup.find(name='div', attrs={'class': 'village_info base_info'})
    if not informations:
        print('{}, {}, {}, {}爬取失敗!'.format(l_region, s_region, community_name, community_url))
        return None
    all_info = [l_region, s_region, community_name, community_type]
    for ele in ['本月均價', '小區地址', '產權描述', '環線位置', '建築年代', '建築面積', '占地面積', '房屋總數', '樓棟總數', '綠 化 率', '容 積 率', '物 業 費', '開 發 商', '物業公司']:
        try:
            all_info.append(informations.find('span', text=ele).parent.find(name='p').text.strip().replace('\r', '').replace('\n', '、').replace('\t', '').replace(',', ','))
        except Exception:
            try:
                # Some fields (e.g. the developer) render as a link instead of plain text.
                all_info.append(informations.find('span', text=ele).parent.find(name='a').text.strip().replace('\r', '').replace('\n', '、').replace('\t', '').replace(',', ','))
            except Exception:
                all_info.append('')  # field missing on this page
    return all_info

def get_data(last_url=None):
    """Main loop: scrape every 住宅/別墅 community; resumable via last_url."""
    with open('上海地區小區url.txt', 'r', encoding='utf-8') as f:
        village_list = f.readlines()
    error_count = 0
    event_tracking = last_url is None
    for line in village_list:
        l_region, s_region, community_name, community_type, community_url = line.split(',')
        community_url = community_url.replace('\n', '')
        if last_url == community_url:
            event_tracking = True
        if not event_tracking:
            continue
        if community_type != '住宅' and community_type != '別墅':
            continue  # only residential and villa communities are wanted
        try:
            with open('上海小區數據.txt', 'a', encoding='utf-8') as file:
                back = get_total_informations(l_region, s_region, community_name, community_type, community_url)
                if not back:
                    # Three consecutive failures usually means we are being blocked.
                    if error_count >= 2:
                        break
                    error_count += 1
                    time.sleep(1)
                    continue
                error_count = 0
                file.write(','.join(back))
                file.write('\n')
                # Bug fix: the original passed 5 args to a 4-placeholder format string.
                print('{}, {}, {}, {}爬取成功!'.format(l_region, s_region, community_name, community_type))
                time.sleep(1)
        except Exception:
            print('{}, {}, {}, {}爬取失敗!'.format(l_region, s_region, community_name, community_url))
            traceback.print_exc()
            break

if __name__ == "__main__":
    get_region_urls()  # step 1: street/town urls per district
    # Bug fix: the original called get_xiaoqu_url() with no arguments (TypeError);
    # the intended traversal over all street/town urls lives in get_all_urls().
    get_all_urls()     # step 2: every community url
    get_data()         # step 3: every community's details
六、安居客代碼

import requests
from bs4 import BeautifulSoup
import re
import time
import traceback
import math  # bug fix: get_all_urls calls math.ceil but math was never imported

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
    'Cookie': ''
}

def get_jiezhen_urls():
    """Collect the street/town urls of every district into 街鎮url.txt."""
    # District name -> anjuke url slug.
    lregion_dict = {'浦東': 'pudong', '閔行': 'minhang', '松江': 'songjiang', '寶山': 'baoshan', '嘉定': 'jiading',
                    '徐匯': 'xuhui', '青浦': 'qingpu', '靜安': 'jingan', '普陀': 'putuo', '楊浦': 'yangpu',
                    '奉賢': 'fengxian', '黃浦': 'huangpu', '虹口': 'hongkou', '長寧': 'changning', '金山': 'jinshan',
                    '崇明': 'chongming'}
    for k, v in lregion_dict.items():
        url = 'https://shanghai.anjuke.com/community/' + v + '/'
        r = requests.get(url=url, headers=headers)
        soup = BeautifulSoup(r.text, 'lxml')
        a = soup.find_all('li', attrs={'class': 'region-item'})
        # NOTE(review): assumes the first 19 region-items are district-level links
        # and street/town links start at index 19 — verify if the page layout changes.
        for i in range(19, len(a)):
            temp = a[i].find('a')
            with open('街鎮url.txt', 'a', encoding='utf-8') as file:
                file.write(','.join([k, temp.text, temp.attrs['href']]))
                file.write('\n')
        print('{}區域的url都爬取完畢!'.format(k))
        time.sleep(1)

def region_total(url):
    """Return the number of communities the region page reports."""
    r = requests.get(url=url, headers=headers)
    soup = BeautifulSoup(r.text, 'lxml')
    total_find = soup.find(name='span', attrs={'class': 'total-info'})
    return int(total_find.text.replace('共找到 ', '').replace(' 個小區', ''))

def get_all_urls():
    """Collect every community's name/address/tags/url (25 listings per page)."""
    with open('街鎮url.txt', 'r', encoding='utf-8') as f:
        small_region_list = f.readlines()
    for ele in small_region_list:
        l_region, s_region, url = ele.split(',')
        url = url.replace('\n', '')
        total_num = region_total(url)
        pages = int(math.ceil(int(total_num) / 25))  # 25 listings per page
        for page in range(1, pages + 1):
            tmp_url = url + 'p' + str(page) + '/'
            r = requests.get(url=tmp_url, headers=headers)
            soup = BeautifulSoup(r.text, 'lxml')
            for item in soup.find_all('div', attrs={'class': 'li-info'}):
                xiaoqu_name = item.find('div', attrs={'class': 'li-title'}).text.strip()
                xiaoqu_address = item.find('div', attrs={'class': 'props nowrap'}).text.split(' - ')[-1].strip()
                xiaoqu_tag = item.find_all('span', attrs={'class': 'prop-tag'})
                xiaoqu_url = item.find('span', text='小區解讀').parent.find('a').attrs['href']
                xiaoqu_url = xiaoqu_url.replace('props/sale', 'view')  # point at the overview page
                # Skip tags the page hides with inline CSS.
                tag_list = [tag.text for tag in xiaoqu_tag if 'display:none' not in str(tag)]
                # '$' as separator: names/addresses may contain commas.
                with open('小區url.txt', 'a', encoding='utf-8') as file:
                    file.write('$'.join([l_region, s_region, xiaoqu_name, xiaoqu_address, str(tag_list), xiaoqu_url]))
                    file.write('\n')
            time.sleep(1)
        print('{}, {}總共有{}個小區,共有{}頁,已全部url爬取完成!'.format(l_region, s_region, total_num, pages))

def get_communityInfo(l_region, s_region, community_name, community_address, community_tag, community_url):
    """Scrape one community page and export its record; abort on the anti-crawler page."""
    r = requests.get(url=community_url, headers=headers)
    soup = BeautifulSoup(r.text, 'lxml')
    if '訪問驗證-ajk' in str(soup):
        print('觸發反爬機制了!url是', community_url)
        exit()  # captcha page reached: stop the whole run
    try:
        unitPrice = soup.find(name='span', attrs={'class': 'average'}).string  # average price
    except Exception:  # was a bare except; keep the fallback value
        unitPrice = '暫無均價'
    xiaoquInfo = soup.find_all(name='div', attrs={'class': 'hover-inner'})  # info blocks
    xiaoquInfo_list = [l_region, s_region, community_name, community_address, community_tag, unitPrice]
    for info in xiaoquInfo:
        temp = info.find('div', attrs={'class': 'hover-value'})
        if temp:
            xiaoquInfo_list.append(temp.text.replace('\n', '').strip())
    export_communityInfo(xiaoquInfo_list)
    time.sleep(0.5)
    print('已pa取{},{}的{}信息'.format(l_region, s_region, community_name))

def export_communityInfo(xiaoquInfo_list):
    """Append one '&'-joined community record to the output file."""
    with open('上海地區小區信息.txt', 'a', encoding='utf-8') as file:
        file.write('&'.join(xiaoquInfo_list))
        file.write('\n')

if __name__ == "__main__":
    # get_jiezhen_urls()  # step 1: street/town urls
    # get_all_urls()      # step 2: all community names and urls
    with open('小區url.txt', 'r', encoding='utf-8') as f:
        xiaoqu_list = f.readlines()
    # Resume point: scraping restarts at this url ('' means start at the top).
    last_url = 'https://shanghai.anjuke.com/community/view/8338/'
    stop_place = False
    for ele in xiaoqu_list:
        l_region, s_region, community_name, community_address, community_tag, community_url = ele.split('$')
        community_url = community_url.replace('\n', '')
        if community_url == last_url or last_url == '':
            stop_place = True
        if stop_place:
            try:
                get_communityInfo(l_region, s_region, community_name, community_address, community_tag, community_url)
            except Exception:
                print('{}爬取失敗,url是:{}'.format(community_name, community_url))
                traceback.print_exc()
                break
七、數據清洗和特征工程
獲取的數據很臟,有重復值需要剔重,有異常值需要修正(比如明顯是外環的數據被歸納為內環);需要根據業務場景,區分小區是否高檔;需要根據需要,與內部數據結合…
我這邊就舉例幾種場景,供大家參考(以安居客數據為例):
1、從標簽中判斷小區是否靠近地鐵
# Flag a community as near a metro line when its tag text mentions one.
data['是否靠近地鐵'] = data['標籤'].apply(
    lambda tags: '是' if ('近地鐵' in str(tags) or '號線' in str(tags)) else '否'
)
2、從標簽中判斷環線位置
def huanxian_position(text):
    """Map a tag string to its ring-road position label, or NaN if none matches."""
    tags = str(text)
    # Same check order as before: first label found in the tags wins.
    for label in ('內環以內', '內中環之間', '郊環以外', '外郊環之間', '中外環之間'):
        if label in tags:
            return label
    return np.nan

data['環線位置'] = data['標籤'].apply(huanxian_position)
3、糾正環線位置
# Correct each community's ring position to the majority label of its business area.
data_pivot = data.pivot_table(index='所屬商圈', columns='環線位置', values='名稱', aggfunc='count').reset_index()
data_pivot['環線位置2'] = ''
for i in range(data_pivot.shape[0]):
    # NOTE(review): assumes the pivot contains exactly these five ring columns in
    # this order — verify against the actual pivot output.
    huan_dict = {
        '中外環之間': data_pivot.iloc[i, 1],
        '內中環之間': data_pivot.iloc[i, 2],
        '內環以內': data_pivot.iloc[i, 3],
        '外郊環之間': data_pivot.iloc[i, 4],
        '郊環以外': data_pivot.iloc[i, 5],
    }
    best_answer = ''
    best_v = 0
    for k, v in huan_dict.items():
        # Bug fix: the original tested `v == np.nan`, which is always False
        # (NaN != NaN); use the NaN self-inequality check instead.
        if v != v:
            continue
        if v >= best_v:
            best_v = v  # bug fix: track the running maximum so the majority label wins
            best_answer = k
    data_pivot.iloc[i, 6] = best_answer

huan_dict = {}
for k, v in zip(data_pivot['所屬商圈'].values, data_pivot['環線位置2'].values):
    huan_dict[k] = v

data['環線位置'] = data['所屬商圈'].map(huan_dict)
4、根據竣工時間判斷小區年齡
def new_age(text):
    """Derive the community's age (years) from its completion-year field."""
    if str(text) == 'nan':
        return np.nan
    # Field looks like '2005年' (possibly '、'-joined values; first one is used).
    year = int(text.split('、')[0].replace('年', ''))
    return 2022 - year

data['小區年齡'] = data['竣工時間'].apply(new_age)
5、判斷是否商務樓宇、園區等(鏈家)
def if_business(text):
    """Return '是' if the name suggests an office building / business park, else '否'."""
    # Bug fix: the original returned '否' inside the first loop iteration, so only
    # '商務' was ever actually tested; check every keyword before deciding.
    keywords = ('商務', '園區', '大廈', '寫字樓', '廣場')
    return '是' if any(ele in text for ele in keywords) else '否'

data['是否商務樓宇等'] = data['小區名稱'].apply(if_business)
6、提取物業費上下限(鏈家)
def wuyefei_down(text):
    """Return the lower bound of the property-management fee (unit stripped)."""
    if text is np.nan:
        return np.nan
    # '至' separates a lower/upper range; without it, the single value is both bounds.
    if '至' in text:
        text = text.split('至')[0]
    return text.replace('元/平米/月', '')

def wuyefei_up(text):
    """Return the upper bound of the property-management fee (unit stripped)."""
    if text is np.nan:
        return np.nan
    if '至' in text:
        text = text.split('至')[1]
    return text.replace('元/平米/月', '')

data['物業費下限'] = data['物業費'].apply(wuyefei_down)
data['物業費上限'] = data['物業費'].apply(wuyefei_up)
7、判斷小區名字是否有地址
def if_number(text):
    """Return '是' if the community name contains a digit, else '否'."""
    return '是' if re.search(r'\d', text) else '否'

data['小區名稱里是否有數字'] = data['名稱'].apply(if_number)
8、匹配百度經緯度
from urllib.request import urlopen, quote
import json
import math
from math import radians, cos, sin, asin, sqrt
import requests

def getjwd_bd(address):
    """Look up (longitude, latitude) for an address via the Baidu geocoding API."""
    try:
        base = 'http://api.map.baidu.com/geocoding/v3/?address='
        output = 'json'
        ak = '******'  # fill in your own Baidu application key
        # The address is Chinese text, so URL-encode it first to avoid mojibake.
        query = base + quote(address) + '&output=' + output + "&ak=" + ak
        payload = json.loads(urlopen(query).read().decode())
        location = payload['result']['location']
        return float(location['lng']), float(location['lat'])
    except:  # noqa: E722 — deliberately best-effort: any failure yields NaN coords
        return np.nan, np.nan

for i in tqdm(range(data.shape[0])):
    region = data.iloc[i, 0]
    # Pudong is a 新區, every other district takes the plain 區 suffix.
    region = '上海市浦東新區' if region == '浦東' else '上海市' + region + '區'
    xiaoqu_name = data.iloc[i, 2]
    address = data.iloc[i, 3]
    if str(data.iloc[i, 19]) != 'nan':
        continue  # longitude already filled in; skip
    lng1, lat1 = getjwd_bd(region + address + xiaoqu_name)
    # Sanity box around Shanghai; anything outside counts as a failed lookup.
    if 120 <= lng1 <= 122 and 30 <= lat1 <= 32:
        data.iloc[i, 19] = lng1
        data.iloc[i, 20] = lat1
    else:
        data.iloc[i, 19] = np.nan
        data.iloc[i, 20] = np.nan
9、計算兩個經緯度之間的距離(用於與內部數據匹配)
def get_distance(lng1, lat1, lng2, lat2):
    """Haversine distance in metres (rounded) between two lng/lat points."""
    lng1, lat1, lng2, lat2 = map(radians, [float(lng1), float(lat1), float(lng2), float(lat2)])  # degrees -> radians
    dlon = lng2 - lng1
    dlat = lat2 - lat1
    a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
    # 6371393 m = mean earth radius (~6371 km).
    return round(2 * asin(sqrt(a)) * 6371393, 0)

for i in tqdm(range(data.shape[0])):
    xiaoqu_name = data.iloc[i, 2]
    lng1 = data.iloc[i, 18]
    lat1 = data.iloc[i, 19]
    match_wg = data.iloc[i, 24]
    min_distance = 9999999999
    tmp_grid_cd = ''
    tmp_grid_name = ''
    if str(match_wg) != 'nan':
        print('{}已匹配,跳過'.format(xiaoqu_name))
        continue  # grid already assigned
    for j in range(grid_data.shape[0]):
        lng2 = grid_data.iloc[j, 19]
        lat2 = grid_data.iloc[j, 20]
        grid_cd = grid_data.iloc[j, 0]
        grid_name = grid_data.iloc[j, 1]
        if str(lng2) == 'nan':
            continue  # grid has no coordinates
        tmp_distance = get_distance(lng1, lat1, lng2, lat2)
        if tmp_distance == 0:
            # Distance 0 after rounding = exact match; stop scanning grids.
            print('{}精確匹配的網格是{}'.format(xiaoqu_name, grid_name))
            data.iloc[i, 24] = grid_cd
            print('-' * 100)
            break
        if tmp_distance < min_distance:
            min_distance = tmp_distance
            tmp_grid_cd = grid_cd
            tmp_grid_name = grid_name
    else:
        # for/else: loop finished without an exact match -> use the nearest grid.
        data.iloc[i, 24] = tmp_grid_cd
        print('{}模糊匹配的網格是{}'.format(xiaoqu_name, tmp_grid_name))
        print(min_distance, tmp_grid_cd, tmp_grid_name)
        print('-' * 100)
10、找出區域內top10%均價的房子
# Collect the original row indices of the top-10%-priced communities per district.
region_dict = data['行政區'].value_counts().to_dict()
top10_list = []
for region, count in region_dict.items():
    ranked = data[data['行政區'] == region].sort_values(by='均價', ascending=False).reset_index()
    # reset_index keeps the original row labels in the 'index' column.
    top10_list.extend(ranked.iloc[:int(count * 0.1), :]['index'].to_list())

data['是否區域內均價top10%'] = '否'
for i in top10_list:
    data.loc[i, '是否區域內均價top10%'] = '是'
11、判斷是否高檔小區
def if_upscale(df):
    """Classify one row as an upscale community: returns '是' or '否'."""
    # Villas are upscale by definition; very cheap communities never are.
    if df['物業類型'] == '別墅':
        return '是'
    if df['均價'] <= 30000:
        return '否'
    # Otherwise upscale when young and central, pricey to maintain, or top-priced locally.
    if df['小區年齡'] <= 10 and df['環線位置'] in ('內環以內', '內中環之間', '中外環之間'):
        return '是'
    if df['物業費'] >= 3:
        return '是'
    if df['是否區域內均價top10%'] == '是':
        return '是'
    return '否'

data['是否高檔小區'] = data.apply(if_upscale, axis=1)