Python爬蟲-百度貼吧


百度貼吧爬蟲實現

  GET請求

import time
import urllib
import urllib.parse
from urllib import request

# https://tieba.baidu.com/f?kw=python&fr=ala0&tpl=5    #第一頁 
# https://tieba.baidu.com/f?kw=python&ie=utf-8&pn=50 #第二頁 (2-1)*50
# https://tieba.baidu.com/f?kw=python&ie=utf-8&pn=100 #第三頁 (3-1)*50    
# https://tieba.baidu.com/f?kw=python&ie=utf-8&pn=150 #第四頁 (4-1)*50
# 第n頁    (n-1)*50 
# 推測第一頁:https://tieba.baidu.com/f?kw=python&ie=utf-8&pn=0 

# Request headers shared by every page fetch; the User-Agent makes the
# request look like a desktop Chrome browser so Baidu serves the full HTML.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/81.0.4044.122 Safari/537.36"
    ),
}

def loadPage(url, filename):
    """Send a GET request to *url* and return the raw response body as bytes.

    *filename* is used only for the progress message; the module-level
    ``headers`` dict supplies the browser-like User-Agent.
    """
    print("正在下載" + filename)
    req = request.Request(url, headers=headers)
    # Use the response as a context manager so the underlying HTTP
    # connection is always closed (the original leaked it).
    with request.urlopen(req) as resp:
        return resp.read()

def writePage(html, filename):
    """Write the raw HTML bytes to a local file named *filename*."""
    print("正在保存" + filename)
    # Binary mode: the payload is the undecoded bytes from the server.
    with open(filename, "wb") as out:
        out.write(html)
    print("---------------------------")


def tiebaSpider(url, begin, end):
    """Crawl tieba result pages *begin*..*end* (inclusive) and save each one.

    Baidu paginates in steps of 50, so page n starts at offset (n-1)*50.
    """
    for page_no in range(begin, end + 1):
        offset = (page_no - 1) * 50
        page_url = url + "&pn=" + str(offset)          # URL for this page
        out_name = "D:/貼吧/第" + str(page_no) + "頁.html"  # local save path

        body = loadPage(page_url, out_name)  # fetch the page HTML
        writePage(body, out_name)            # persist it to disk


if __name__ == '__main__':
    # Interactive driver: repeatedly prompt for a keyword and a page range,
    # then crawl those pages. Runs until interrupted (Ctrl-C).
    while True:
        kw = input("請輸入字條:")
        begin = int(input("請輸入起始頁:"))
        end = int(input("請輸入結束頁:"))

        # urlencode percent-escapes the keyword so non-ASCII input forms a
        # valid query string. (urllib.parse is now imported explicitly
        # instead of relying on a side effect of importing urllib.request.)
        url = "http://tieba.baidu.com/f?"
        key = urllib.parse.urlencode({"kw": kw})
        url = url + key
        tiebaSpider(url, begin, end)

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM