從貝殼網獲取房價信息。
基本的步驟和我的這篇博文一樣:https://www.cnblogs.com/mrlayfolk/p/12319414.html。不熟悉的可參考一下。
下面的代碼用於獲取3000個樣本。
# encoding:utf-8

'''
Purpose: scrape house price information from Beike Zhaofang (ke.com).
URL: https://cd.ke.com/ershoufang/qingyang/l2/
Environment: python 3.7.3
Required libraries: requests, BeautifulSoup, xlwt
'''

import logging
import xlwt
import requests
import string
from bs4 import BeautifulSoup

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36',
    "Host": "cd.ke.com",
}

# Header row of the sheet; the order matches the field order of each row
# produced by get_info().
COLUMN_TITLES = ('名稱', '位置', '房屋信息', '總價(萬)', '單價(元/平方米)')

# URL template of one result page; %d is the page number.
PAGE_URL = 'https://cd.ke.com/ershoufang/qingyang/pg%dl2/'


def save_info(content, path='./house_info.xls'):
    """Save the scraped rows to an .xls workbook.

    Args:
        content: iterable of rows. Each row is a sequence whose first five
            items are name, position, house info, total price and unit
            price, in that order.
        path: destination file. Defaults to the path the script has always
            written to.
    """
    # utf-8 rather than ascii: the sheet holds Chinese text.
    workbook = xlwt.Workbook(encoding='utf-8')
    worksheet = workbook.add_sheet('house info')

    for col, title in enumerate(COLUMN_TITLES):
        worksheet.write(0, col, title)

    # Row 0 is the header, so data rows start at 1.
    for row_idx, row in enumerate(content, start=1):
        for col, value in enumerate(row[:len(COLUMN_TITLES)]):
            worksheet.write(row_idx, col, value)

    workbook.save(path)


def _clean_unit_price(text):
    """Strip the label and unit from a unit-price string, leaving the number."""
    # NOTE(review): the label is removed in both Traditional and Simplified
    # form; the site presumably serves Simplified text - confirm.
    for noise in ('單價', '单价', '元/平米'):
        text = text.replace(noise, '')
    return text


def _parse_page(html):
    """Extract the five listing columns from the HTML of one result page.

    Returns:
        A tuple of five lists (titles, positions, house infos, total prices,
        unit prices), each in document order.
    """
    soup = BeautifulSoup(html, 'lxml')

    def collect(css_class, extract):
        return [extract(tag)
                for tag in soup.find_all('div', {'class': css_class})]

    titles = collect('info clear', lambda tag: tag.div.a.text.strip())
    positions = collect('positionInfo', lambda tag: tag.a.text.strip())
    houses = collect(
        'houseInfo',
        lambda tag: tag.text.strip().replace('\n', ' ').replace(' ', ''))
    totals = collect('totalPrice', lambda tag: tag.span.text.strip())
    units = collect(
        'unitPrice', lambda tag: _clean_unit_price(tag.span.text.strip()))
    return titles, positions, houses, totals, units


def get_info(page_count=100):
    """Fetch the listing pages and return one row per listing.

    Args:
        page_count: number of result pages to request (pages 1..page_count).

    Returns:
        A list of rows, each a list
        [title, position, house info, total price, unit price].

    Pages that cannot be fetched, do not answer with status 200, or yield
    unequal numbers of the five fields are skipped with a logged warning, so
    that a single bad page neither aborts the crawl nor shifts the fields of
    later rows against each other.
    """
    all_info = []

    # Start at 1: the progress output has always been 1-based, while the
    # original loop requested pg0..pg99 and never reached the last page.
    for page in range(1, page_count + 1):
        link = PAGE_URL % page
        try:
            r = requests.get(link, headers=headers, timeout=10)
        except requests.RequestException as exc:
            logging.warning('page %d: request failed (%s), skipped', page, exc)
            continue

        print(str(page), 'status_code: ', r.status_code)
        if r.status_code != 200:
            logging.warning('page %d: unexpected status %d, skipped',
                            page, r.status_code)
            continue

        columns = _parse_page(r.text)
        sizes = [len(column) for column in columns]
        if len(set(sizes)) > 1:
            # The columns are matched purely by position, so unequal counts
            # mean the fields can no longer be paired reliably.
            logging.warning('page %d: field counts differ %s, skipped',
                            page, sizes)
            continue

        all_info.extend(list(row) for row in zip(*columns))

    print(len(all_info))
    return all_info


if __name__ == "__main__":
    all_info = get_info()
    save_info(all_info)
