Python 爬取鏈家租房信息


import requests as rq
from bs4 import BeautifulSoup
import json
import time
import pandas as pd

# Entry page of the Lianjia Beijing rental listings.
home_url = 'https://bj.lianjia.com/zufang'

# Desktop Chrome User-Agent string, split across lines for readability;
# adjacent literals are concatenated into one string.
_USER_AGENT = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/70.0.3538.102 Safari/537.36'
)

# Headers passed as `headers=` to the HTTP requests made by this script.
headers = {'User-Agent': _USER_AGENT}

# Fetch the rental home page. The timeout keeps a stalled connection from
# hanging the script forever, and raise_for_status() stops on an HTTP error
# instead of parsing an error page into an empty district list.
home_response = rq.get(home_url, headers=headers, timeout=10)
home_response.raise_for_status()
home_rt = home_response.text
home_soup = BeautifulSoup(home_rt, 'lxml')

# Collect the entry link of every district from the home-page district filter.
district_url_rt = home_soup.find_all('li', attrs={'class': 'filter__item--level2', 'data-type': 'district'})
# Each entry is [district name, absolute district URL]. The first filter item
# is skipped (presumably the "all districts" entry -- TODO confirm).
district_urls = [
    [item.a.string, 'https://bj.lianjia.com' + item.a.attrs['href']]
    for item in district_url_rt[1:]
]

print(district_urls)
print('區域接口獲取完畢')

# One row per listing, accumulated across all districts.
finally_house_result = []
# Seconds to wait for any single HTTP response before giving up, so that a
# stalled connection cannot hang the whole run.
request_timeout = 10

# Visit every district entry link and collect the listings found there.
for dis_url in district_urls:
    time.sleep(5)
    # `+ ''` yields a plain str when the name is a bs4 NavigableString.
    district_name = dis_url[0] + ''
    district_url = dis_url[1]
    district_rt = rq.get(district_url, headers=headers, timeout=request_timeout).text
    district_soup = BeautifulSoup(district_rt, 'lxml')
    # Number of result pages for this district, read from the pagination element.
    page_num = int(district_soup.find('div', attrs={'class': 'content__pg'}).attrs['data-totalpage'])

    # Walk every result page and collect [title, address, url] for each listing.
    house_titurl = []
    for page in range(1, page_num + 1):
        time.sleep(0.8)
        page_url = district_url + f'/pg{page}'  # URL of the current result page
        page_results = rq.get(page_url, headers=headers, timeout=request_timeout).text
        # Name the parser explicitly, like the other soups in this script, so
        # bs4 does not have to guess one (and warn about it).
        page_soup = BeautifulSoup(page_results, 'lxml')
        current_page_rts = page_soup.find_all('div', attrs={'class': 'content__list--item'})  # listing cards on this page

        for houselist_rt in current_page_rts:
            house_url = 'https://bj.lianjia.com' + houselist_rt.a['href']  # detail-page URL
            house_title = houselist_rt.a.img['alt']  # listing title
            address_list = houselist_rt.div.find('p', attrs={'class': 'content__list--item--des'}).find_all('a')
            address = address_list[1].string + '.' + address_list[2].string  # address
            house_titurl.append([house_title, address, house_url])
    district_num = len(house_titurl)
    print(f'{district_name}房屋標題&url獲取完畢,共{district_num}套租房信息')

    # Open every listing's detail page and extract its fields.
    # NOTE(review): the fixed indices below assume one specific detail-page
    # layout; a page with fewer matching elements raises IndexError and ends
    # the run before the results are written out.
    for house_page in house_titurl:
        time.sleep(0.6)
        house_title, address, house_url = house_page
        house_rt = rq.get(house_url, headers=headers, timeout=request_timeout).text
        house_soup = BeautifulSoup(house_rt, 'lxml')

        # Cost table: items 5-9 supply the values; items 1-4 supply a <span>
        # text appended to them (presumably the unit -- TODO confirm).
        house_rt1 = house_soup.find_all('li', attrs={'class': 'table_col'})
        # NOTE(review): pay_method is extracted but never added to the output row.
        pay_method = house_rt1[5].string  # payment method
        rent = house_rt1[6].string + house_rt1[1].find('span').string  # rent
        deposit = house_rt1[7].string + house_rt1[2].find('span').string  # deposit
        service_fee = house_rt1[8].string + house_rt1[3].find('span').string  # service fee
        agency_fee = house_rt1[9].string + house_rt1[4].find('span').string  # agency fee

        # Basic-info items; [3:] drops the first three characters of each
        # item (presumably a label prefix -- TODO confirm).
        house_rt2 = house_soup.find_all('li', attrs={'class': 'fl oneline'})
        size = house_rt2[1].string[3:]  # floor area
        toward = house_rt2[2].string[3:]  # orientation
        in_time = house_rt2[5].string[3:]  # move-in date
        rent_term = house_rt2[7].string[3:]  # lease term
        storey = house_rt2[10].string[3:]  # floor
        elevator = house_rt2[11].string[3:]  # elevator
        gas = house_rt2[17].string[3:]  # gas

        # Supporting facilities: every item from index 21 onwards. The list
        # already fetched above is reused instead of searching the document
        # again for each item.
        supporting_facilities = [item.text.strip() for item in house_rt2[21:]]
        supporting_facilities = json.dumps(supporting_facilities, ensure_ascii=False)

        # Agent contact details; zip() stops at the shortest of the three lists.
        agency_names = house_soup.find_all('a', attrs={'class': 'name'})
        agency_phones = house_soup.find_all('div', attrs={'class': 'phone'})
        agency_scores = house_soup.find_all('div', attrs={'class': 'rate'})
        agency_list = [
            {'中介姓名': name.string, '電話': phone.string, '評分': score.text.strip()}
            for name, phone, score in zip(agency_names, agency_phones, agency_scores)
        ]
        agency_list = json.dumps(agency_list, ensure_ascii=False)

        finally_house_result.append([district_name, address, house_title, size, toward, storey, elevator, gas, supporting_facilities, rent_term, in_time, rent, deposit, service_fee, agency_fee, agency_list])
    print(f'{district_name}房屋信息獲取完畢,共{district_num}套')

# Column headers, listed in the same order as the fields of each collected row.
columns = [
    '區域', '地址', '標題', '面積',
    '朝向', '樓層', '電梯', '燃氣',
    '配套設施', '租期', '入住時間', '房租',
    '押金', '服務費', '中介費', '中介聯系方式',
]

# Build the table from the collected rows and write it to an Excel workbook.
house_finally_dfdata = pd.DataFrame(data=finally_house_result, columns=columns)
output_path = 'd:\\Desktop\\20191124鏈家北京各城區租房信息.xlsx'
house_finally_dfdata.to_excel(output_path)

# Report how many listings were collected in total.
data_num = len(finally_house_result)
print(f'北京市各城區租房信息獲取完畢,共{data_num}套')

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯繫本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM