爬蟲實現qq音樂歌單無vip批量下載
分享歌單鏈接
電腦網頁無法獲取歌單完整信息,所以需要借助手機下載網頁文件
利用下載網站實現批量下載
music.py
import requests
from fake_useragent import UserAgent
from lxml import html
from selenium import webdriver
from time import sleep
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
# 讀取qq音樂分享文件
def get_html_file():
    """Prompt for the path of a saved HTML file and return its text.

    The path is read from standard input and the file is decoded as UTF-8.

    Returns:
        The entire file content as a single string.
    """
    path = input('請輸入html文件地址(項目目錄下則直接輸入文件名包括后綴名):')
    with open(path, 'r', encoding='utf-8') as handle:
        return handle.read()
# 從html中獲取歌曲的信息(歌名和歌手)
def get_music_name_and_singer(html_):
    """Extract the song entries from the saved playlist page.

    Args:
        html_: HTML source of the playlist page as a string.

    Returns:
        The list of text nodes found under every
        ``<span class="song_list__txt">`` element.
    """
    document = html.etree.HTML(html_)
    # This query targets playlists whose songs share one artist. For
    # playlists with mixed artists the author's alternative query was
    # '//p[@class="song_list__desc"]/text()'.
    return document.xpath('//span[@class="song_list__txt"]/text()')
# 從下載網站獲取歌曲下載地址
def get_download_url(music_info):
    """Look up the download link for one song on musictool.top.

    Opens the site's search page for ``music_info`` in headless Chrome,
    waits for the page to render, and reads the ``href`` of the
    ``<a id="j-src-btn">`` element from the rendered page source.

    Args:
        music_info: Song description used as the search keyword
            (converted with ``str()``).

    Returns:
        The download URL as a string, or ``None`` when the rendered page
        contains no non-empty ``j-src-btn`` link.
    """
    # Local import: the file's import block does not bring in urllib.
    from urllib.parse import quote

    # Percent-encode the keyword so characters such as '&', '#' or '/'
    # inside a song title cannot truncate or alter the query string.
    url = 'https://www.musictool.top/?name={}&type=qq'.format(
        quote(str(music_info), safe=''))

    option = webdriver.ChromeOptions()
    option.add_argument('headless')
    dcap = dict(DesiredCapabilities.CHROME)
    # NOTE(review): `chrome_options=` / `desired_capabilities=` are
    # deprecated in Selenium 4 -- confirm the installed Selenium version
    # still accepts them. Also verify that this capability key really
    # changes Chrome's user agent (TODO confirm).
    dcap['chrome.page.settings.userAgent'] = UserAgent().chrome
    driver = webdriver.Chrome(chrome_options=option, desired_capabilities=dcap)
    try:
        driver.get(url)
        # Fixed pause before reading the page: the site fills in its
        # content asynchronously, and the author's notes also use this
        # delay to keep the crawl slow.
        sleep(6)
        response = driver.page_source
    finally:
        # Always shut the browser down; otherwise every looked-up song
        # leaves a Chrome process running.
        driver.quit()

    # xpath() returns a list (possibly empty), never None, so test the
    # joined result for emptiness instead of comparing with None.
    hrefs = html.etree.HTML(response).xpath('//a[@id="j-src-btn"]/@href')
    download_url = ''.join(hrefs)
    if not download_url:
        return None
    print(download_url)
    return download_url
# 下載歌曲,保存歌曲
def download_music(download_url, name):
    """Download one song and save it as ``music/<name>.mp3``.

    Args:
        download_url: Direct URL of the audio file. ``None`` or an empty
            string means no link is available for the song.
        name: File name (without extension) used for the saved file and
            in the progress messages.

    Returns:
        None. Success or failure is reported on stdout.
    """
    # Bail out early: without a URL there is nothing to request. The
    # original printed the failure but then still issued the request.
    if not download_url:
        print(name + '---下載失敗')
        return

    # Local import: the file's import block does not bring in os.
    import os

    headers = {'User-Agent': UserAgent().random}
    try:
        # A timeout keeps one stalled server from hanging the whole batch.
        response = requests.get(download_url, headers=headers, timeout=30)
    except requests.RequestException:
        # A network error for one song is reported like any other
        # failed download instead of aborting the run.
        print(name + '---下載失敗')
        return

    if response.status_code != 200:
        print(name + '---下載失敗')
        return

    # Create the output directory on first use so open() cannot fail
    # merely because 'music/' does not exist yet.
    os.makedirs('music', exist_ok=True)
    with open(os.path.join('music', name + '.mp3'), 'wb') as f:
        f.write(response.content)
    print('下載完成---' + name)
    print('--------------------')
# 主方法,遍歷歌曲信息執行下載歌曲
def main():
    """Run the batch download for every song in the saved playlist page.

    Reads the HTML file, extracts the song entries, then for each entry
    looks up its download link and saves the file.
    """
    page_source = get_html_file()
    songs = get_music_name_and_singer(page_source)
    print(songs)
    for song in songs:
        print('開始下載---' + song)
        link = get_download_url(song)
        # Turn the entry into a file name: collapse the ' · ' separator,
        # replace '/' with '_', drop '?', and trim surrounding whitespace.
        file_name = song.replace(' · ', ' ')
        file_name = file_name.replace('/', '_')
        file_name = file_name.replace('?', '')
        file_name = file_name.strip()
        download_music(link, file_name)


if __name__ == '__main__':
    main()
說明:
1.歌曲下載網站使用的是Ajax異步請求,所以不能通過常規方法爬取歌曲
2.采用selenium來獲取網頁完整的代碼,從而獲取歌曲的下載地址
3.爬取過程中ua很重要的,一定要設置
4.訪問速度一定不要太快,所以采用了sleep()方法來減慢爬取速度,防止被檢測到電腦操作,從而報錯