百度貼吧爬蟲實現
GET請求
"""Baidu Tieba crawler: downloads forum result pages for a keyword via GET.

Observed URL pattern:
    https://tieba.baidu.com/f?kw=python&ie=utf-8&pn=0      # page 1
    https://tieba.baidu.com/f?kw=python&ie=utf-8&pn=50     # page 2  (2-1)*50
    https://tieba.baidu.com/f?kw=python&ie=utf-8&pn=100    # page 3  (3-1)*50
    ...
    page n -> pn = (n-1)*50
"""
import os
import urllib.parse  # explicit: bare `import urllib` does not guarantee .parse
from urllib import request

# Spoof a desktop browser so the server returns the regular HTML page
# instead of a bot-detection / mobile variant.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/81.0.4044.122 Safari/537.36"
    )
}


def loadPage(url, filename):
    """Send a GET request to *url* and return the raw response body as bytes.

    *filename* is used only for the progress message.
    """
    print("正在下載" + filename)
    req = request.Request(url, headers=headers)
    # Timeout prevents the crawler from hanging forever on a stalled server
    # (the original call had none).
    return request.urlopen(req, timeout=15).read()


def writePage(html, filename):
    """Write the raw *html* bytes to *filename*, creating parent dirs as needed."""
    print("正在保存" + filename)
    # Bug fix: the original crashed with FileNotFoundError when the target
    # directory (e.g. D:/貼吧/) did not exist yet.
    directory = os.path.dirname(filename)
    if directory:
        os.makedirs(directory, exist_ok=True)
    with open(filename, "wb") as f:
        f.write(html)
    print("---------------------------")


def tiebaSpider(url, begin, end):
    """Crawl pages *begin*..*end* (inclusive) of the forum query at *url*.

    Each page is fetched with loadPage() and saved to a local HTML file.
    """
    for page in range(begin, end + 1):
        pn = (page - 1) * 50                       # page n starts at offset (n-1)*50
        fullurl = url + "&pn=" + str(pn)           # URL for this page
        filename = "D:/貼吧/第" + str(page) + "頁.html"  # local save path
        html = loadPage(fullurl, filename)         # fetch the page
        writePage(html, filename)                  # persist to disk


if __name__ == '__main__':
    # Interactive loop: keep asking for a keyword and a page range until the
    # user interrupts (Ctrl+C), matching the original behavior.
    while True:
        kw = input("請輸入字條:")
        begin = int(input("請輸入起始頁:"))
        end = int(input("請輸入結束頁:"))
        url = "http://tieba.baidu.com/f?"
        # urlencode percent-encodes non-ASCII keywords (e.g. Chinese) safely.
        key = urllib.parse.urlencode({"kw": kw})
        url = url + key
        tiebaSpider(url, begin, end)
