Scraping all residential-community (小區) listings for a given city from Anjuke (安居客)


While crawling I found that requesting pages too quickly makes the site pop up a slider CAPTCHA, so each request is followed by a random time delay, which keeps the scraped data complete. I chose the communities of Qingdao; support for entering a city name could be added later (a sketch of that extension follows the code). The average price shown on a community's detail (second-level) page is generated dynamically: getting it there would mean requesting a JSON endpoint with a fairly complex URL, i.e. one extra request per community. So the average price is taken directly from the listing (first-level) page and passed into the function that parses the detail page, which is more efficient. The code is as follows:

"""
    爬取安居客所有小區信息
"""
import requests
from fake_useragent import UserAgent
from lxml import etree
import csv
import re
import time
import random


class AnjukeSpider(object):
    def __init__(self):
        self.url = 'https://qd.anjuke.com/community/p{}/'

    def get_headers(self):
        """Build the request headers with a randomized User-Agent."""
        ua = UserAgent()
        headers = {
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "accept-encoding": "gzip, deflate, br",
            "accept-language": "zh-CN,zh;q=0.9",
            "cache-control": "max-age=0",
            "cookie": "aQQ_ajkguid=534DDCC9-5DBA-263A-CF4D-SX0716083828; isp=true; 58tj_uuid=e559fdad-fdb9-4a73-8c60-9e6e3bf82987; Hm_lvt_c5899c8768ebee272710c9c5f365a6d8=1563237510; als=0; _ga=GA1.2.1881437242.1569052175; ctid=30; wmda_uuid=edd62dcc1e73bddc16beeb56087fd1f8; wmda_new_uuid=1; wmda_visited_projects=%3B6289197098934; sessid=F6826357-F68F-1E17-B5A1-99FEA17341CA; lps=http%3A%2F%2Fwww.anjuke.com%2F%7Chttps%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DcuNIKoO-jX3CGzD7komT_lY2umPIHgZjjBdMMdFnpZHirHVPOLorVTafN32HS5R_%26ck%3D7150.2.84.414.190.439.289.917%26shh%3Dwww.baidu.com%26sht%3D02003390_42_hao_pg%26wd%3D%26eqid%3Dc2951ba5003c81ad000000065d881f86; twe=2; wmda_session_id_6289197098934=1569202063874-b62b0050-2be7-3851; _gid=GA1.2.388348263.1569202065; init_refer=https%253A%252F%252Fwww.baidu.com%252Flink%253Furl%253DcuNIKoO-jX3CGzD7komT_lY2umPIHgZjjBdMMdFnpZHirHVPOLorVTafN32HS5R_%2526ck%253D7150.2.84.414.190.439.289.917%2526shh%253Dwww.baidu.com%2526sht%253D02003390_42_hao_pg%2526wd%253D%2526eqid%253Dc2951ba5003c81ad000000065d881f86; new_uv=3; new_session=0",
            "referer": "https://qd.anjuke.com/community/?from=navigation",
            "sec-fetch-mode": "navigate",
            "sec-fetch-site": "none",
            "sec-fetch-user": "?1",
            "upgrade-insecure-requests": "1",
            "user-agent": ua.random
        }
        return headers

    def get_link(self, url):
        """Parse a listing page for each community's detail-page link and average price."""
        text = requests.get(url=url, headers=self.get_headers()).text
        html = etree.HTML(text)
        link = html.xpath("//h3/a/@href")
        price = html.xpath('//*[@id="list-content"]/div/div[2]/p[1]/strong/text()')
        print(link)
        print(price)
        for i in zip(link, price):
            print(i)
        return zip(link, price)

    def parse_message(self, url, price):
        """Parse the required fields from a community's detail (second-level) page.

        The average price is passed in from the listing page, because on the
        detail page it is generated dynamically via a separate JSON request.
        """
        dict_result = {'小區': '-', '地址': '-', '價格': '-', '物業類型:': '-', '物業費:': '-',
                       '總建面積:': '-', '總戶數:': '-', '建造年代:': '-', '停車位:': '-',
                       '容積率:': '-', '綠化率:': '-', '開發商:': '-', '物業公司:': '-',
                       '所屬商圈:': '-', '二手房房源數:': '-', '租房源數:': '-', '相關學校:': '-'}
        text = requests.get(url=url, headers=self.get_headers()).text
        html = etree.HTML(text)
        table1 = html.xpath('/html/body/div[2]/div[3]/div[1]/h1//text()')  # community name and address
        table1 = list(map(lambda item: re.sub(r'\s+', '', item), table1))  # strip newlines and tabs
        table1 = list(filter(None, table1))  # drop the empty strings produced above
        dict_result['小區'] = table1[0]
        dict_result['地址'] = table1[1]
        dict_result['價格'] = price
        table2 = html.xpath('//*[@id="basic-infos-box"]/dl//text()')
        table2 = list(map(lambda item: re.sub(r'\s+', '', item), table2))
        table2 = list(filter(None, table2))
        # texts alternate label/value, so pair even and odd elements
        for label, value in zip(table2[::2], table2[1::2]):
            dict_result[label] = value
        # price = html.xpath('//*[@id="basic-infos-box"]/div[1]/span[1]/text()')
        # dict_result['價格'] = price[0]  # never matches: the price lives in a JSON response
        table3 = html.xpath('//*[@id="basic-infos-box"]/div[2]//text()')
        table3 = list(map(lambda item: re.sub(r'\s+', '', item), table3))
        table3 = list(filter(None, table3))
        for label, value in zip(table3[::2], table3[1::2]):
            dict_result[label] = value
        print(dict_result)
        return dict_result

    def save_csv(self, result):
        """Append the parsed rows to the CSV file."""
        # a list (not a set) keeps the column order stable
        headers = ['小區', '地址', '價格', '物業類型:', '物業費:', '總建面積:', '總戶數:',
                   '建造年代:', '停車位:', '容積率:', '綠化率:', '開發商:', '物業公司:',
                   '所屬商圈:', '二手房房源數:', '租房源數:', '相關學校:']
        with open('青島.csv', 'a', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, headers)
            # writer.writeheader()  # would repeat the header on every page; see the sketch below
            for row in result:
                writer.writerow(row)

    def run(self):
        """Main loop: walk the listing pages and save each page's rows."""
        for i in range(1, 101):  # 272 pages in total; only the first 100 are fetched here
            url = self.url.format(i)
            link = self.get_link(url)
            list_result = []
            for j in link:
                try:
                    result = self.parse_message(j[0], j[1])
                    # random 1-3 s pause so the slider CAPTCHA is not triggered
                    time.sleep(round(random.uniform(1, 3), 1))
                    list_result.append(result)
                except Exception as err:
                    print(err)
            self.save_csv(list_result)
            print("page %s saved" % i)
        # single-page test calls left for debugging:
        # url = 'https://qd.anjuke.com/community/view/875393?from=Filter_1&hfilter=filterlist'
        # self.parse_message(url)
        # self.get_link()


if __name__ == '__main__':
    spider = AnjukeSpider()
    spider.run()
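As mentioned in the introduction, the spider is hard-wired to Qingdao. A minimal sketch of the city-name extension, assuming each city is addressed by an Anjuke subdomain such as 'qd' for Qingdao; the subdomain scheme and the class below are illustrative assumptions, not something verified in this post:

# Hypothetical extension: pick the city when constructing the spider.
# Assumes city subdomains like 'qd' (Qingdao) or 'bj' (Beijing).
class CityAnjukeSpider(AnjukeSpider):
    def __init__(self, city_code='qd'):
        super().__init__()
        # double braces keep the {} placeholder for the later .format(page)
        self.url = 'https://{}.anjuke.com/community/p{{}}/'.format(city_code)

Usage would then be CityAnjukeSpider('qd').run(); note that save_csv still writes to the fixed file 青島.csv, so a complete version would parameterize the output file name as well.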

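The writeheader() call in save_csv is commented out because the file is opened in append mode once per page, so re-enabling it would repeat the header row before every page's rows. A small sketch of one way to emit the header exactly once, assuming the output file name stays 青島.csv:

import csv
import os

def save_csv_once(rows, headers, path='青島.csv'):
    # emit the header row only when the file is first created
    new_file = not os.path.exists(path)
    with open(path, 'a', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, headers)
        if new_file:
            writer.writeheader()
        writer.writerows(rows)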
