百度貼吧爬蟲實現
GET請求
"""Baidu Tieba crawler: downloads forum result pages for a keyword via GET.

Observed URL pattern:
    https://tieba.baidu.com/f?kw=python&ie=utf-8&pn=0      # page 1
    https://tieba.baidu.com/f?kw=python&ie=utf-8&pn=50     # page 2  (2-1)*50
    https://tieba.baidu.com/f?kw=python&ie=utf-8&pn=100    # page 3  (3-1)*50
    ...
    page n -> pn = (n-1)*50
"""
import os
import urllib.parse  # explicit: bare `import urllib` does not guarantee .parse
from urllib import request

# Spoof a desktop browser so the server returns the regular HTML page
# instead of a bot-detection / mobile variant.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/81.0.4044.122 Safari/537.36"
    )
}


def loadPage(url, filename):
    """Send a GET request to *url* and return the raw response body as bytes.

    *filename* is used only for the progress message.
    """
    print("正在下載" + filename)
    req = request.Request(url, headers=headers)
    # Timeout prevents the crawler from hanging forever on a stalled server
    # (the original call had none).
    return request.urlopen(req, timeout=15).read()


def writePage(html, filename):
    """Write the raw *html* bytes to *filename*, creating parent dirs as needed."""
    print("正在保存" + filename)
    # Bug fix: the original crashed with FileNotFoundError when the target
    # directory (e.g. D:/貼吧/) did not exist yet.
    directory = os.path.dirname(filename)
    if directory:
        os.makedirs(directory, exist_ok=True)
    with open(filename, "wb") as f:
        f.write(html)
    print("---------------------------")


def tiebaSpider(url, begin, end):
    """Crawl pages *begin*..*end* (inclusive) of the forum query at *url*.

    Each page is fetched with loadPage() and saved to a local HTML file.
    """
    for page in range(begin, end + 1):
        pn = (page - 1) * 50                       # page n starts at offset (n-1)*50
        fullurl = url + "&pn=" + str(pn)           # URL for this page
        filename = "D:/貼吧/第" + str(page) + "頁.html"  # local save path
        html = loadPage(fullurl, filename)         # fetch the page
        writePage(html, filename)                  # persist to disk


if __name__ == '__main__':
    # Interactive loop: keep asking for a keyword and a page range until the
    # user interrupts (Ctrl+C), matching the original behavior.
    while True:
        kw = input("請輸入字條:")
        begin = int(input("請輸入起始頁:"))
        end = int(input("請輸入結束頁:"))
        url = "http://tieba.baidu.com/f?"
        # urlencode percent-encodes non-ASCII keywords (e.g. Chinese) safely.
        key = urllib.parse.urlencode({"kw": kw})
        url = url + key
        tiebaSpider(url, begin, end)
