Scraping a City's Housing Data from Fang.com (房天下) with Python
With the rise of the Internet era, technology changes by the day, and picking up a new skill matters a great deal for career growth. This is my first demo; I plan to dig deeper into crawling and data analysis later. I am not going to write detailed documentation, so wherever it falls short I hope more experienced readers will point it out. Enough talk, here is the code.
Skills you will need:
(1) Familiarity with front-end basics and with debugging pages in the browser's developer tools
(2) A solid grasp of Python fundamentals and the commonly used libraries
(3) Working knowledge of a relational database (a minimal setup sketch follows this list)
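Before running the crawler, the third-party libraries have to be installed and the MySQL database named in its connection string has to exist. The sketch below is my own assumption about a reasonable setup, not part of the original post; the root:root@localhost:3306 credentials simply mirror the connection string used later in the script.

    # Minimal environment sketch (assumed): install the libraries the crawler
    # relies on, then create the database its connection string expects.
    #   pip install requests beautifulsoup4 pandas sqlalchemy pymysql
    from sqlalchemy import create_engine, text

    # Same credentials as the crawler below; adjust for your own MySQL instance.
    engine = create_engine("mysql+pymysql://root:root@localhost:3306/?charset=utf8")
    with engine.connect() as conn:
        # pandas will create the city_house_price table on first insert,
        # so only the database itself needs to exist up front.
        conn.execute(text("CREATE DATABASE IF NOT EXISTS houseinfo DEFAULT CHARACTER SET utf8"))

With that in place, the full crawler script follows.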
import requests as req
import pandas as pd
from bs4 import BeautifulSoup
from sqlalchemy import create_engine

def getHouseInfo(url):
    # Scrape a single listing page and return its attributes as a dict
    info = {}
    soup = BeautifulSoup(req.get(url).text, "html.parser")
    resinfo = soup.select(".tab-cont-right .trl-item1")
    # Extract layout, floor area, unit price, orientation, floor and decoration
    for item in resinfo:
        tmp = item.text.strip().split("\n")
        name = tmp[1].strip()
        if "朝向" in name:
            name = name.strip("進門")
        if "樓層" in name:
            name = name[0:2]
        if "地上層數" in name:
            name = "樓層"
        if "裝修程度" in name:
            name = "裝修"
        info[name] = tmp[0].strip()
    # Neighbourhood (小區) name and total price
    xiaoqu = soup.select(".rcont .blue")[0].text.strip()
    info["小區名字"] = xiaoqu
    zongjia = soup.select(".tab-cont-right .trl-item")
    info["總價"] = zongjia[0].text
    return info

domain = "http://esf.anyang.fang.com/"
city = "house/"

def getTotalPage():
    # Read the pager on the first listing page to find the total page count
    res = req.get(domain + city + "i31")
    soup = BeautifulSoup(res.text, "html.parser")
    endPage = soup.select(".page_al a").pop()['href']
    pageNum = endPage.strip("/").split("/")[1].strip("i3")
    print("loading..... " + pageNum + " pages in total .....")
    return pageNum

def pageFun(i):
    # Crawl one listing page and return its listings as a DataFrame
    pageUrl = domain + city + "i3" + i
    print(pageUrl + " loading... page " + i + " .....")
    res = req.get(pageUrl)
    soup = BeautifulSoup(res.text, "html.parser")
    houses = soup.select(".shop_list dl")
    pageInfoList = []
    for house in houses:
        try:
            info = getHouseInfo(domain + house.select("a")[0]['href'])
            pageInfoList.append(info)
            print(info)
        except Exception as e:
            print("----> exception, skipping this listing and continuing", e)
    return pd.DataFrame(pageInfoList)

connect = create_engine("mysql+pymysql://root:root@localhost:3306/houseinfo?charset=utf8")
for i in range(1, int(getTotalPage()) + 1):
    try:
        df_onePage = pageFun(str(i))
        # Append this page's rows to the city_house_price table
        pd.io.sql.to_sql(df_onePage, "city_house_price", connect, schema="houseinfo", if_exists="append")
    except Exception as e:
        print("Exception", e)
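Once the crawl finishes, the rows can be read back out of MySQL with pandas for analysis. The snippet below is only an illustrative sketch under the same connection settings as above; the column name 小區名字 comes from the dict keys built in getHouseInfo, and the queries themselves are my own examples.

    import pandas as pd
    from sqlalchemy import create_engine

    # Assumes the crawler has already populated houseinfo.city_house_price
    connect = create_engine("mysql+pymysql://root:root@localhost:3306/houseinfo?charset=utf8")
    df = pd.read_sql("SELECT * FROM city_house_price", connect)
    print(df.shape)                                  # rows x columns scraped so far
    print(df["小區名字"].value_counts().head(10))     # most frequently listed neighbourhoods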