python 爬取鏈家二手房信息


1、網頁分析(獲取所有城市列表)

citys.py

'''
Created on 2017-10-9

@author: wbhuangzhiqiang
'''
import csv
import urllib.request
from bs4 import BeautifulSoup

# Lianjia home page; its city list is scraped below.
url = 'https://www.lianjia.com'

# Fetch the raw HTML of the page.
html = urllib.request.urlopen(url).read()

# Parse with html5lib: slower than lxml but the most tolerant of broken
# markup, which is why it is used throughout these scripts.
bsobj = BeautifulSoup(html, "html5lib")

# All <a> tags under the div with class "cityList clear" -- one per city.
city_tags = bsobj.find("div", {"class": "cityList clear"}).findChildren("a")

# Write one (name, url) row per city into citys.csv.
# newline='' is required by the csv module: without it the writer emits
# blank rows on Windows (the companion house_mess() already passes it).
with open("./citys.csv", "w", newline='') as f:
    writ = csv.writer(f)
    for city_tag in city_tags:
        # href of the <a> tag, e.g. the city's subdomain URL
        city_url = city_tag.get("href")
        # link text of the <a> tag, e.g. the city name
        city_name = city_tag.get_text()
        writ.writerow((city_name, city_url))
        print(city_name, city_url)

2、二手房信息

'''
Created on 2017-10-9

@author: wbhuangzhiqiang
'''
import sys
import re
import csv
import urllib.request
from bs4 import BeautifulSoup

# Fetch `url` and return it parsed as a BeautifulSoup object.
# On any non-200 status, print an error message and terminate the program.
def  get_bsobj(url):
    response = urllib.request.urlopen(url)
    # Guard clause: anything other than HTTP 200 is fatal for this crawler.
    if response.getcode() != 200:
        print ("頁面錯誤")
        sys.exit()
    # html5lib is the most fault-tolerant parser available to bs4.
    return BeautifulSoup(response.read(), "html5lib")
        
def get_house_info_list(url):
    """Scrape one Lianjia listing page and return a list of house dicts.

    Side effect: assigns the module-level global ``house_info_page`` with
    the total page count read from the pager link; house_mess() relies on
    that value.  Returns None when get_bsobj() yields a falsy value.
    The dict keys are the Chinese column names later written to house.csv.
    """
    house_info_list = []
    bsobj = get_bsobj(url)
    if not bsobj:
        return None
    # Total number of result pages, published by the site in an <a> tag.
    global house_info_page
    house_page = bsobj.find("a", {"gahref":"results_totalpage"})
    house_info_page=int(house_page.get_text())
    # One <div class="info"> per house listing on the page.
    house_list = bsobj.find_all("div", {"class":"info"})
    for  house in house_list:
        # Row 1 text, e.g. "加怡名城 | 2室1廳 | 62.48平米 | 西 | 精裝"; split on "|".
        info = house.find("span", {"class": "info-col row1-text"}).get_text().split("|")
        # Row 2 carries the location fields (and sometimes the build year).
        info2 = house.find("span", {"class": "info-col row2-text"}).get_text().split("|")
        # Unit-price text, stripped of surrounding whitespace.
        minor = house.find("span", {"class": "info-col price-item minor"}).get_text().strip()
        # Location string: row-2 fields deliberately re-ordered as [1]+[2]+[0].
        block = info2[1].strip()+info2[2].strip()+info2[0].strip()
        # Build year is optional -- only present when row 2 has a 4th field.
        if len(info2)>3:
            naidai = info2[3].strip()
        else:
            naidai='未知'
        # Layout, e.g. "2室1廳".
        house_type =info[0].strip()
        # Floor area, e.g. "62.48平米".
        size =info[1].strip()
        # Total price = number + unit, concatenated from two spans.
        price_sz = house.find("span", {"class": "total-price strong-num"}).get_text()
        price_dw = house.find("span", {"class": "unit"}).get_text()
        price =price_sz+price_dw
        house_info_list.append({'房型':house_type,'面積':size,'價格':price,'房屋位置':block,'年代':naidai,'單價':minor})
    return  house_info_list
# Crawl up to 21 result pages (d0..d20) and save the rows to house.csv.
def  house_mess(url):
    """Crawl the listing pages under `url` and write them to ./house.csv.

    The first request only serves to populate the module-level global
    ``house_info_page`` (set by get_house_info_list); subsequent page
    URLs follow Lianjia's ``/dN`` pagination scheme.
    """
    house_info_list = []
    # First hit: discover how many pages there are (sets house_info_page).
    get_house_info_list(url)
    # BUG FIX: the original only crawled when house_info_page > 20 and
    # silently wrote nothing for smaller districts.  Crawl
    # min(house_info_page, 20) + 1 pages so small districts are saved too.
    last_page = min(house_info_page, 20)
    for i in range(last_page + 1):
        new_url = url + '/d' + str(i)
        # get_house_info_list() may return None on a bad page; skip it
        # instead of crashing on extend(None).
        page_rows = get_house_info_list(new_url)
        if page_rows:
            house_info_list.extend(page_rows)

    if house_info_list:
        # newline='' prevents the csv module from emitting blank rows on Windows.
        with open("./house.csv", "w", newline='') as f:
            writer = csv.writer(f)
            # Header row: the dict keys, in insertion order.
            writer.writerow(house_info_list[0].keys())
            for house_info in house_info_list:
                writer.writerow(house_info.values())
# Example: house_mess('http://sh.lianjia.com/ershoufang/minhang')

3、main.py

'''
Created on 2017-10-9

@author: wbhuangzhiqiang
'''
#coding=gbk
import csv
import sys
import urllib.request
from bs4 import BeautifulSoup
from house_info import house_mess
def  get_city_dict(path='./citys.csv'):
    """Read the city CSV written by citys.py and return {city_name: city_url}.

    `path` defaults to the file citys.py writes, so existing callers are
    unaffected.  Rows with fewer than two columns (e.g. blank lines that
    the original writer could emit on Windows) are skipped instead of
    raising IndexError, which the original `len(city) > 0` check allowed.
    """
    city_dict = {}
    # newline='' is the documented way to open files for the csv module.
    with open(path, 'r', newline='') as f:
        for row in csv.reader(f):
            if len(row) >= 2:
                city_dict[row[0]] = row[1]
    return city_dict
# Module-level snapshot of the city map, built as a side effect of import.
# NOTE(review): run() builds its own copy, so this binding looks unused --
# it may exist only for interactive inspection; confirm before removing.
city_dict = get_city_dict()



def get_district_dict(url):
    """Return a mapping of district name -> district URL for a city's
    ershoufang page (scraped from the div with class "level1")."""
    page_html = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(page_html, "html5lib")
    anchors = soup.find("div", {"class": "level1"}).findChildren("a")
    # One entry per <a> tag: link text is the district name, href its URL.
    return {anchor.get_text(): anchor.get("href") for anchor in anchors}

def   run():
    """Interactive entry point: ask the user for a city and a district,
    then crawl that district's second-hand-house listings via house_mess()."""
    city_dict = get_city_dict()
    # Show every known city name on one line so the user can pick one.
    for name in city_dict:
        print(name, end=' ')
    print()

    key_city = input("請輸入城市  ")
    # Resolve the typed city name to its URL; unknown input is fatal.
    city_url = city_dict.get(key_city)
    if not city_url:
        print("輸入錯誤")
        sys.exit()
    print(key_city, city_url)

    # The second-hand-house section lives under /ershoufang.
    ershoufang_city_url = city_url + "/ershoufang"
    print(ershoufang_city_url)

    district_dict = get_district_dict(ershoufang_city_url)
    # Show every district of the chosen city.
    for name in district_dict:
        print(name, end=' ')
    print()

    input_district = input("請輸入地區:")
    district_url = district_dict.get(input_district)
    # Unknown district is fatal as well.
    if not district_url:
        print("輸入錯誤")
        sys.exit()

    # Both inputs valid: crawl the district and save the listings.
    house_mess(city_url + district_url)

if __name__ == "__main__":
    run()
        

4、以上海閔行為例,house.csv 爬取的內容為

 

結果表明,上海房價真的是高啊~~

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM