python爬蟲：找房助手V1.0-爬取58同城租房信息

本文轉載自查看原文 2016-06-21 09:34 4223 python spider

1.用於爬取58上的租房信息，限成都，其他地方的，可以把網址改改；

2.這個爬蟲有一點問題，就是沒用多線程，因為我用了之後總是會報：'module' object has no attribute '_strptime' 這個奇怪的錯誤，掙扎了許久，放棄（註：這是 Python 2 中多個線程同時首次調用 strptime 時的已知問題，在啟動線程之前先執行一次 import _strptime 即可避免）；

如有大神看到這篇帖子，希望可以指點一二，不勝感激，謝謝。

3.我本來打算做成EXE文件的，但是在中文處理方面總是亂碼，需要進一步研究；

以下為代碼：

#!/usr/bin/python
# -*- encoding:utf-8 -*-

import requests
from bs4 import BeautifulSoup
from multiprocessing.dummy import Pool as ThreadPool
import re
import datetime
import sys
# from datetime import datetime
# Switch the interpreter-wide default string encoding to UTF-8.
# NOTE(review): both calls are Python 2-only as written (`reload` is used as a
# builtin). The rest of the script presumably relies on this when it mixes
# byte strings with unicode text -- confirm before removing.
reload(sys)
sys.setdefaultencoding('utf-8')
def urlBS(url, timeout=30):
    """Download *url* and return the page parsed as a BeautifulSoup document.

    url: address of the page to fetch.
    timeout: seconds to wait for the server.  Without a timeout a single
        stalled connection would block the crawl forever; with one,
        requests raises requests.exceptions.Timeout instead.

    Returns the BeautifulSoup tree built with the "lxml" parser.
    """
    response = requests.get(url, timeout=timeout)
    # The body is always decoded as UTF-8, whatever the response headers say.
    response.encoding = 'utf-8'
    return BeautifulSoup(response.text, "lxml")

def get_source_url():
    """Interactively build the 58.com (Chengdu) rental search URL.

    Prompts on the console for the listing source, the price range, the
    number of rooms and free-text key words.  Every answer is read as plain
    text and validated: an empty, malformed or out-of-range answer (or the
    end of the input stream) falls back to the default named in the prompt.

    Returns the search URL as a string.
    """
    # Whole-flat rentals live under /zufang/; the next path segment selects
    # the listing source: '' = any, '0/' = private landlord, '1/' = agent.
    base_url = 'http://cd.58.com/zufang/'
    source_from = {1: '', 2: '0/', 3: '1/'}
    room_num = {0: '', 1: 'j1/', 2: 'j2/', 3: 'j3/', 4: 'j4/', 5: 'j5/'}

    # Python 2's input() evaluates whatever is typed as an expression, so the
    # answers are read with raw_input() there.  Python 3 has no raw_input and
    # its input() already returns the text unevaluated.
    try:
        read_line = raw_input
    except NameError:
        read_line = input

    def ask(prompt):
        """Return the typed line, or '' when the input stream has ended."""
        try:
            return read_line(prompt)
        except EOFError:
            return ''

    def ask_choice(prompt, choices, default):
        """Return the typed menu number if it is a key of *choices*, else *default*."""
        try:
            picked = int(ask(prompt))
        except ValueError:
            return default
        return picked if picked in choices else default

    def ask_price(prompt, default):
        """Return the typed amount if it consists only of digits, else *default*."""
        typed = ask(prompt).strip()
        return typed if typed.isdigit() else default

    source_key = ask_choice('請按序號輸入你想要的房屋來源，1為不限，2為個人房源，3為經紀人(默認為2個人房源):\n', source_from, 2)
    price_min = ask_price('請輸入你期望的價格下限(不輸默認為500)：\n', '500')
    price_max = ask_price('請輸入你期望的價格上限(不輸默認為1000)：\n', '1000')
    room_key = ask_choice('請輸入你想要的房子間數：0為不限，1為1室，2為2室，3為3室，4為4室，5為4室以上(不輸默認為1室):\n', room_num, 1)
    # NOTE: the key words are appended to the query string exactly as typed
    # (no URL escaping), as in the original script.
    key_words = ask('請輸入你想要搜索的其他關鍵詞，如小區名稱，地鐵位置等(不輸默認為空)：\n')

    # The site takes both bounds in one parameter: minprice=<low>_<high>.
    price = 'minprice=' + price_min + '_' + price_max
    return (base_url + source_from[source_key] + room_num[room_key]
            + '?' + price + '&key=' + key_words)
# new_url='http://cd.58.com/zufang/0/j1/?minprice=600_800&PGTID=0d300008-0006-6cd9-6ba7-a7672ec996c3&ClickID=3'
def get_new_list(source_url):
    """Collect the URL of every result page, starting with *source_url*.

    The site does not show a total page count, so pages are discovered by
    following the "next" link (<a class="next">) of each page in turn.
    According to the original author's note, the first "next" link can point
    at the page already being shown, so the link's own page number is not
    trusted: the /pnN/ segment of the link is rewritten to the current page
    number plus one (a URL without a /pnN/ segment counts as page 1).

    source_url: URL of the first result page.

    Returns a list of page URLs, first page first, without duplicates.
    Crawling stops when a page has no "next" link or when the next URL has
    already been collected, which also guards against an endless loop.
    """
    site_root = 'http://cd.58.com'
    # \d+ rather than a single character, so pages 10 and above are recognised.
    page_segment = re.compile(r'/pn(\d+)/')

    page_urls = [source_url]
    visited = set(page_urls)
    current_url = source_url
    while True:
        next_link = urlBS(current_url).find('a', {'class': 'next'})
        href = next_link.get('href') if next_link is not None else None
        if not href:
            break
        # The original always prefixed the site root; only do so for relative links.
        next_url = href if href.startswith('http') else site_root + href

        # The page number is read from the page just fetched, not from source_url.
        on_page = page_segment.search(current_url)
        page_number = int(on_page.group(1)) if on_page else 1
        # Replace the whole segment, keeping the '/pn' prefix and both slashes.
        next_url = page_segment.sub('/pn%d/' % (page_number + 1), next_url, 1)

        if next_url in visited:
            break
        visited.add(next_url)
        page_urls.append(next_url)
        current_url = next_url
    return page_urls
def get_house_url(new_url):
    """Return the listing detail links found on one result page.

    new_url: URL of a result page.

    Returns the href of every <a> located inside a <div> whose class
    attribute is exactly "img_list", in document order.
    """
    anchors = urlBS(new_url).select('div[class="img_list"] a')
    return [anchor['href'] for anchor in anchors]
# Whitespace removed from scraped field values before they are printed or saved.
_FIELD_WHITESPACE = re.compile(r'\n|\t|\r| ')
# First year-month-day date found inside the "updated at" label.
_UPDATE_DATE = re.compile(r'(\d{4})-(\d{1,2})-(\d{1,2})')


def _node_text(node):
    """Return node.text, or '' when the element was not found (node is None)."""
    return node.text if node is not None else ''


def _compact(node):
    """Return the element's text with newlines, tabs, CRs and spaces removed."""
    return _FIELD_WHITESPACE.sub('', _node_text(node))


def _parse_update_date(label):
    """Return the date contained in *label* as a datetime.date, or None.

    The date is located with a regex and built with datetime.date() instead
    of datetime.strptime().  That makes the parse independent of the exact
    label text in front of the date, and it avoids strptime's lazy import of
    the _strptime module, which is not thread-safe on Python 2 and fails with
    "'module' object has no attribute '_strptime'" under a thread pool.
    """
    found = _UPDATE_DATE.search(label)
    if not found:
        return None
    try:
        return datetime.date(*[int(part) for part in found.groups()])
    except ValueError:      # e.g. month 13
        return None


def house_info(house_url):
    """Scrape one listing page; print and save it unless it is filtered out.

    A listing is silently dropped when
      * its phone number is missing/hidden or does not start with 1 followed
        by five digits (land-line numbers are treated as suspicious),
      * its description contains one of the black-listed phrases, or
      * its update date is unreadable or more than 30 days old.

    Listings that pass are printed to stdout and appended to the module-level
    file object `f`, which the __main__ section opens before calling this
    function.  Page elements that cannot be found yield empty fields instead
    of aborting the crawl.

    house_url: URL of the listing detail page.
    Returns None.
    """
    soup = urlBS(house_url)

    # Phone: private listings use one class string, agents another.
    tel_node = soup.find('span', {'class': 'tel-num tel-num-geren pl30 f30'})
    if tel_node is None:
        tel_node = soup.find('span', {'class': 'tel-num pl30 f30'})
    tel = _node_text(tel_node)
    # Surrounding whitespace must not defeat the '^1' anchor.
    if not re.search(r'^1\d{5}.*', tel.strip()):
        return

    situation = _node_text(soup.find('div', {'class': 'description-content'})).strip()
    # NOTE(review): the last alternative, "+", matches a run of two or more
    # double-quote characters; presumably a literal plus sign was intended.
    # NOTE(review): the phrases are in Traditional Chinese; confirm they match
    # the script the site actually serves.
    if re.search(u'(我是房東|男士勿擾|男生勿擾|限女生|微信|男士|男性|男生|女性|女的|姐妹|"+")', situation):
        return

    update_time = _node_text(soup.find('span', {'class': 'pl10'}))
    update_date = _parse_update_date(update_time)
    oldest_allowed = datetime.date.today() - datetime.timedelta(days=30)
    if update_date is None or update_date < oldest_allowed:
        return

    title = _node_text(soup.find('h1', {'class': 'main-title font-heiti'}))
    price = _compact(soup.find('i', {'class': 'ncolor'}))

    # The first three generic detail rows are, in order: size/layout,
    # residential community, location.  Missing rows become ''.
    details = soup.find_all('li', {'class': 'house-primary-content-li clearfix'})
    rows = [_compact(row) for row in details[:3]]
    rows += [''] * (3 - len(rows))
    house_content, house_Community, house_place = rows

    facility = _compact(soup.find('li', {'class': 'house-primary-content-li clearfix person-config'}))
    contact = _compact(soup.find('li', {'class': 'house-primary-content-li clearfix person-contact'}))

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(house_url)
    print(situation)
    print(tel)
    print(title)
    print(price)
    print(house_content)
    print(house_Community)
    print(house_place)
    print(facility)
    print(contact)
    print(update_time+'\n\n\n')

    f.write('----------------------------------------------------------------------------------\n')
    f.write(house_url+'\n'+price+'\n'+house_content+'\n'+house_Community+'\n'+house_place+'\n'+title+'\n'+situation+'\n'+facility+'\n\n')

if __name__=='__main__':
    # Script entry point: ask for the search criteria, list every result
    # page, then scrape each listing found on those pages.
    source_url = get_source_url()
    print(source_url)
    # Example of a finished search URL:
    # http://cd.58.com/zufang/0/?minprice=500_1500&key=<key words>

    # Stored under its own name so the get_new_list function is not overwritten.
    page_urls = get_new_list(source_url)

    # house_info() writes to the module-level name `f`, so the handle must keep
    # that name.  Mode "w" empties results left over from an earlier run, and
    # the with-block closes the file even if the crawl stops on an error.
    with open("house_rent.txt", "w") as f:
        print('正在下載，請稍候。。。\n\n')
        # Listings are processed one at a time.  The original author's attempt
        # to use a thread pool (multiprocessing.dummy.Pool) kept failing with
        # "'module' object has no attribute '_strptime'" and was abandoned.
        for new_url in page_urls:
            # UTF-8 round trip kept from the original script.
            new_url = new_url.encode('utf-8').decode('utf-8')
            for each in get_house_url(new_url):
                house_info(each)

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯系本站郵箱yoyou2525@163.com刪除。

猜您在找 爬蟲實戰爬取58同城房源信息 python爬蟲項目(scrapy-redis分布式爬取房天下租房信息) 使用爬蟲scrapy庫爬取58同城出租房的聯系方式地址 python 爬蟲入門案例----爬取某站上海租房圖片 58同城二手車數據爬蟲——數字加密解碼（Python原創） python3 爬蟲教學之爬取鏈家二手房（最下面源碼） //以更新源碼 Python開發爬蟲之BeautifulSoup解析網頁篇：爬取安居客網站上北京二手房數據爬取鏈家租房信息我的第一個爬蟲，爬取北京地區短租房信息爬蟲(成都58同城所有房價,Python實現)