#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Author: Tsukasa
import requests
from bs4 import BeautifulSoup
import pandas
import time
# Listing-index page URLs; populated by the driver code further down.
url_all = []
# Not referenced anywhere else in the visible code.
okl = []
# City code that is spliced into the host name 'esf.<code>.fang.com'.
url_in = input('輸入你所需要城市的字母簡寫:\n如:中山 zs , 廣州 gz\n!!!不要亂輸入,不然運行不了')
# Stored as (requested page count + 1) so it can be used directly as the
# exclusive upper bound of a range() that starts at page 1.
url_number = int(input('輸入爬取頁數:')) + 1
def open(nobe):
    """Collect the detail-page URLs linked from one listing index page.

    NOTE: this function shadows the built-in ``open`` for the rest of the
    module. The name is kept because the driver code calls it.

    Args:
        nobe: URL of a listing index page.

    Returns:
        A list of absolute detail-page URLs, in page order: one for each
        ``.houseList dl`` entry that contains a ``.title a`` link with an
        ``href``. Entries without such a link are skipped.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    # Bound the wait so a stalled connection cannot hang the whole run.
    res = requests.get(nobe, timeout=30)
    soup = BeautifulSoup(res.text, 'html5lib')
    # hrefs are appended verbatim to this site root, which is built from
    # the module-level city code.
    url_start = 'http://esf.' + url_in + '.fang.com'
    http_start = []
    for title in soup.select('.houseList dl'):
        links = title.select('.title a ')
        if not links:
            # No title link in this entry: skip it instead of raising
            # IndexError and losing the rest of the page.
            continue
        url_end = links[0].get('href')
        if not url_end:
            continue
        http_start.append(url_start + url_end)
    return http_start
def content(url):
    """Scrape the details of one listing page into a flat dict.

    Args:
        url: URL of the listing detail page.

    Returns:
        A dict containing:
          * '網頁'     - the url that was passed in
          * '標題'     - stripped text of the first ``h1``
          * '總價'     - text of the first ``.red20b`` element followed by '萬'
          * '聯系電話' - text of the first ``#mobilecode`` element
          * one entry per ``span`` whose text contains '發布時間' and a
            colon; the value is suffixed with '*' and the ``#Time`` text
          * one entry per ``dd`` whose text contains a colon, keyed by the
            text before the first colon

    Raises:
        requests.RequestException: If the request fails or times out.
        IndexError: If the page has no ``h1``, ``.red20b`` or
            ``#mobilecode`` element, or has a publication-time span but
            no ``#Time`` element.
    """
    # Bound the wait so a stalled connection cannot hang the whole run.
    res = requests.get(url, timeout=30)
    soup = BeautifulSoup(res.text, 'html5lib')

    info = {}
    info['網頁'] = url
    info['標題'] = soup.select('h1')[0].text.strip()  # title
    info['總價'] = soup.select('.red20b')[0].text + '萬'  # total price
    info['聯系電話'] = soup.select('#mobilecode')[0].text  # phone number

    # Publication time.
    for sl in soup.select('span'):
        if '發布時間' not in sl.text:
            continue
        # Split on the FIRST colon only, so a value that itself contains
        # colons (e.g. a time of day) no longer breaks the unpacking.
        key, sep, value = sl.text.strip().rstrip('(').partition(':')
        if sep:
            info[key] = value + '*' + soup.select('#Time')[0].text

    # Remaining "key:value" detail fields.
    for dd in soup.select('dd'):
        text = dd.text.strip()
        key, sep, value = text.partition(':')
        if sep:
            info[key] = value
    return info
# Driver: build the index-page URLs, scrape every listing found on them,
# and write the collected rows to an Excel workbook.
print('----------正在運行,請不要關閉----------')
url_home = 'http://esf.' + url_in + '.fang.com/house/i3{}/'
# url_number is (requested page count + 1), so this covers pages 1..N.
url_all.extend(url_home.format(page_no) for page_no in range(1, url_number))

# Failures that a single bad page or listing can raise. They are caught per
# item so one bad page does not discard everything collected so far.
_SCRAPE_ERRORS = (requests.RequestException, IndexError, KeyError, ValueError)

home = []
for page_url in url_all:
    try:
        listing_urls = open(page_url)
    except _SCRAPE_ERRORS as exc:
        print('獲取失敗,已跳過 -----> ', page_url, ' <----- ', repr(exc))
        time.sleep(1)
        continue
    print('正在獲取 -----> ', page_url, ' <-----')
    time.sleep(1)  # pause between index-page requests
    for listing_url in listing_urls:
        try:
            home.append(content(listing_url))
        except _SCRAPE_ERRORS as exc:
            print('\t獲取失敗,已跳過 -> ', listing_url, ' <----- ', repr(exc))
        else:
            print('\t正在獲取詳細信息 -> ', listing_url, ' <-----')
        time.sleep(2)  # pause between detail-page requests

last = pandas.DataFrame(home)
last.to_excel('temp.xlsx', sheet_name='房源信息')
print('----------運行結束----------\n\n----------查看根目錄---------')
# Wait for Enter before the script exits.
abcdefg = input('完成運行')
# 源碼先奉上,以後再填坑。
