爬取音樂資源
實現
"""Download every track on the htqyy.com "hot" chart as an MP3 file.

URL patterns used by the site:

* chart page:  http://www.htqyy.com/top/musicList/hot?pageIndex=<n>&pageSize=20
* player page: http://www.htqyy.com/play/<sid>
* audio file:  http://f2.htqyy.com/play8/<sid>/mp3/6

Each chart entry carries the song title and its numeric id (``sid``), e.g.::

    <span class="title"><a href="/play/46" target="play"
        title="琵琶語" sid="46">琵琶語</a></span>
"""
import os
import re
import time

CHART_URL = "http://www.htqyy.com/top/musicList/hot?pageIndex={page}&pageSize=20"
SONG_URL = "http://f2.htqyy.com/play8/{sid}/mp3/6"

# Browser-like User-Agent.  Built with implicit string concatenation so the
# header does not contain the indentation whitespace that a backslash
# continuation inside a string literal would embed.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/81.0.4044.138 Safari/537.36"
    ),
}

PAGES = 2                    # number of chart pages to crawl (pageIndex 0..PAGES-1)
DOWNLOAD_DIR = "E:\\music"   # destination directory for the .mp3 files
REQUEST_TIMEOUT = 30         # seconds; avoids hanging forever on a stalled server
DOWNLOAD_DELAY = 5           # seconds to pause between two song requests

# Title and sid are captured by ONE pattern so each name stays paired with
# its own id; matching them separately and zipping by index misaligns the
# lists as soon as the page contains a stray title="..." or sid="...".
_SONG_RE = re.compile(r'title="([^"]*)" sid="([^"]*)"')

# Characters that are not allowed in Windows file names.
_UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|]')


def parse_songs(html_text):
    """Return the ``(title, sid)`` pairs found in one chart page.

    Args:
        html_text: Decoded HTML of a chart page.

    Returns:
        A list of ``(title, sid)`` string tuples in page order; empty when
        the page contains no song entries.
    """
    return _SONG_RE.findall(html_text)


def safe_filename(name):
    """Turn a song title into a string usable as a file name.

    Characters that Windows forbids in file names are replaced with ``_``
    and leading/trailing spaces and dots are removed.  ``"untitled"`` is
    returned when nothing usable is left.
    """
    cleaned = _UNSAFE_FILENAME_CHARS.sub("_", name).strip(" .")
    return cleaned or "untitled"


def main():
    """Collect the songs on the first ``PAGES`` chart pages and download them."""
    # Imported here rather than at module level so that the pure helpers
    # above can be imported and tested without the third-party package.
    import requests

    os.makedirs(DOWNLOAD_DIR, exist_ok=True)

    # One session shares the headers and the connection across all requests.
    with requests.Session() as session:
        session.headers.update(HEADERS)

        songs = []
        for page_index in range(PAGES):
            # GET the chart page and extract its entries.
            response = session.get(
                CHART_URL.format(page=page_index), timeout=REQUEST_TIMEOUT
            )
            response.raise_for_status()
            response.encoding = "GBK"  # the site serves GBK-encoded pages
            songs.extend(parse_songs(response.text))

        for title, sid in songs:
            print("正在下載...")
            try:
                audio = session.get(
                    SONG_URL.format(sid=sid), timeout=REQUEST_TIMEOUT
                )
                # Never save an HTTP error page under an .mp3 name.
                audio.raise_for_status()
            except requests.RequestException as exc:
                # One broken track must not abort the rest of the batch.
                print("下載失敗: {0} ({1})".format(title, exc))
            else:
                target = os.path.join(
                    DOWNLOAD_DIR, "{0}.mp3".format(safe_filename(title))
                )
                with open(target, "wb") as f:
                    f.write(audio.content)  # raw binary audio data
            # Be polite to the server between requests.
            time.sleep(DOWNLOAD_DELAY)


if __name__ == "__main__":
    main()
當無法訪問時,可以試試下面的代碼:
# Stand-alone fallback: fetch a single track directly while sending a
# browser-like User-Agent, for when the plain request is refused.
import requests

# Built with implicit string concatenation so the header does not contain
# the indentation whitespace a backslash continuation would embed.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/81.0.4044.138 Safari/537.36"
    ),
}

songurl = "http://f2.htqyy.com/play8/33/mp3/6"
songname = "清風"

# The response body is the raw binary audio data.
response = requests.get(songurl, headers=headers, timeout=30)
# Fail loudly instead of saving an HTTP error page under an .mp3 name.
response.raise_for_status()

print("正在下載...")
with open("D:\\Python\\{0}.mp3".format(songname), "wb") as f:
    f.write(response.content)
總結
當得到的網頁信息是亂碼:
print(requests.get(url).encoding)  # 打印獲取到的網頁信息採用什麼編碼
r = requests.get(url)
r.encoding = 'GBK'
print(r.text)  # 將編碼格式設為網頁所用的 'GBK',就不會出現亂碼
字符串拼接:
+或者format()