Python實例---爬取下載喜馬拉雅音頻文件

本文轉載自查看原文 2018-09-09 00:44 4704 Python實例/ Python

PyCharm下python爬蟲准備

打開pycharm

點擊設置

點擊項目解釋器，再點擊右邊+號

搜索相關庫並添加，例如：requests

喜馬拉雅全網遞歸下載

打開谷歌/火狐瀏覽器，按F12打開開發者工具—>選擇【網絡】

編輯器瀏覽器輸入： https://www.ximalaya.com/yinyue/ 點擊【搖滾】

發現彈出新的URL：https://www.ximalaya.com/yinyue/yaogun/ [漢字轉換拼音后的URL訪問]

點擊進入任意一個專輯[未播放]

點擊播放音樂[播放中]

[這是一個返回JSON格式數據的URL] 查看搜索界面的源代碼，查找albumId，再通過這些albumId獲取音頻文件的URL：

https://www.ximalaya.com/revision/play/album?albumId=16372952&pageNum=1&sort=-1&pageSize=30

最後使用函數urllib.request.urlretrieve（）下載音樂即可

附源碼：

import re
import os
import json
import requests
import urllib
from urllib import request
from pypinyin import lazy_pinyin


class XimaLaya(object):
    """Download audio from a Ximalaya music-category page.

    Workflow: convert a Chinese category name to pinyin, fetch the category
    page, extract album ids and titles from the page text with regular
    expressions, query the album play endpoint for the first album and
    download every audio URL it lists.

    Every file (page snapshot, debug logs, audio) is written below
    ``base_dir``; missing directories are created on demand.
    """

    # Seconds to wait for the server on each HTTP request, so a stalled
    # connection cannot hang the script forever.
    TIMEOUT = 10

    def __init__(self, base_dir='D:\\XiMaLaya'):
        """Set the request headers and the output directory.

        Args:
            base_dir: Root directory for all files this class writes.
                Defaults to the location the script originally hard-coded.
        """
        # Browser-like User-Agent sent with every request.
        self.header = {
            "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0'
        }
        self.base_dir = base_dir

    def _path(self, *parts):
        """Return ``base_dir`` joined with *parts*, creating the parent directory."""
        target = os.path.join(self.base_dir, *parts)
        folder = os.path.dirname(target)
        if folder:
            os.makedirs(folder, exist_ok=True)
        return target

    @staticmethod
    def _safe_name(name):
        """Replace characters that are not allowed in Windows file names."""
        return re.sub(r'[\\/:*?"<>|]', '_', name)

    # Step 1: convert the Chinese input to pinyin
    def han_pinyin(self, hanzi):
        """Return *hanzi* as one pinyin string with no separators.

        ``lazy_pinyin`` yields a list of pinyin strings; they are joined
        directly because the site URL uses the concatenated form.
        """
        return "".join(lazy_pinyin(hanzi))

    # Step 2: fetch the category page
    def getHtml(self, pinyin):
        """Fetch ``https://www.ximalaya.com/yinyue/<pinyin>`` and save a snapshot.

        Args:
            pinyin: Path segment appended to the category base URL.

        Returns:
            The ``requests`` response object for the page.
        """
        url = 'https://www.ximalaya.com/yinyue/' + pinyin
        print("訪問的網站是： " + url)
        html = requests.get(url, headers=self.header, timeout=self.TIMEOUT)
        # apparent_encoding guesses the charset from the body; setting
        # html.encoding = 'utf8' directly would be a cheaper alternative.
        html.encoding = html.apparent_encoding
        # 'w' rather than 'a': the snapshot should hold one page, not an
        # ever-growing concatenation of every run.
        with open(self._path('html', pinyin + '.html'), 'w', encoding='utf-8') as f:
            f.write(html.text)
        return html

    # Step 3: resolve the first album id to its audio URLs
    def getAlbumId(self, html):
        """Return the audio URLs of the first album found in *html*.

        Args:
            html: Response object whose ``text`` contains ``"albumId":<id>``
                entries.

        Returns:
            List of ``src`` URLs found in the album play response, or an
            empty list when the page contains no album id.
        """
        # Capture only the digits; a greedy (.*) would swallow the rest of
        # the line and corrupt the request URL.
        album_ids = re.findall(r'"albumId":\s*"?(\d+)', html.text)
        print("專輯信息", album_ids)
        with open(self._path('albumIdAll', 'albumIdAll.txt'), 'a', encoding='utf-8') as f:
            f.writelines(album_id + '\n' for album_id in album_ids)

        url3 = []
        # Only the first album on the page is processed.
        for album_id in album_ids[:1]:
            url2 = 'https://www.ximalaya.com/revision/play/album?albumId=' + album_id
            print(url2)
            html2 = requests.get(url2, headers=self.header, timeout=self.TIMEOUT)
            # Example match:
            # src   "http://audio.xmcdn.com/group12/M03/2C/AA/wKgDW1WJ7GqxuItqAB8e1LXvuds895.m4a"
            url3 = re.findall(r'"src":"(.*?)"', html2.text)
            # Debug record of what was requested and what came back.
            myList = [
                '獲取對應專輯ID的首頁\r\n' + url2 + '\n---------------------------------------',
                '含有下載URL的集合\r\n' + html2.text + '\n---------------------------------------',
                '下載專輯的URL集合\r\n' + str(url3) + '\n---------------------------------------',
            ]
            with open(self._path('albumIdAll', 'hhh.txt'), 'a', encoding='utf-8') as f:
                # One JSON document per line so repeated runs stay parseable.
                f.write(json.dumps(myList) + '\n')
        print('done')
        return url3

    # Step 4: collect the titles
    def getTitle(self, html):
        """Return every ``"title":"..."`` value found in *html* and log them.

        Args:
            html: Response object exposing the page source as ``text``.

        Returns:
            List of title strings in page order.
        """
        t = re.findall(r'"title":"(.*?)"', html.text)
        with open(self._path('albumIdAll', 'albumId_Name.txt'), 'a', encoding='utf-8') as f:
            f.write(str(t))
        return t

    # Step 5: download the audio files
    def downLoad(self, url, title):
        """Download each URL into ``base_dir`` as ``<title>.m4a``.

        The k-th URL is always paired with ``title[k]``, even after an
        earlier download failed. When *title* is shorter than *url* the
        remaining files are named ``track_<k+1>``.

        Args:
            url: Iterable of audio file URLs.
            title: Sequence of names, parallel to *url*.
        """
        for index, link in enumerate(url):
            name = str(title[index]) if index < len(title) else 'track_%d' % (index + 1)
            try:
                urllib.request.urlretrieve(link, self._path(self._safe_name(name) + '.m4a'))
            except Exception:
                # Best effort per track: report and continue. Unlike a bare
                # ``except`` this still lets Ctrl-C / SystemExit through.
                print(name + "...【下載失敗】")
            else:
                print(name, '...【下載成功】')


if __name__ == '__main__':
    # Script entry point: scrape one genre page and download the audio of
    # the first album listed on it.
    crawler = XimaLaya()

    # Genre name, converted to the pinyin form used in the site URL.
    genre_slug = crawler.han_pinyin("搖滾")

    # Fetch the genre landing page once; both extractions below read it.
    genre_page = crawler.getHtml(genre_slug)

    # Titles found on the genre page (used as file names).
    track_titles = crawler.getTitle(genre_page)

    # Audio URLs of the first album referenced by the genre page.
    audio_urls = crawler.getAlbumId(genre_page)

    # Save every audio URL to disk.
    crawler.downLoad(audio_urls, track_titles)

喜馬拉雅單一專輯的下載

打開谷歌/火狐瀏覽器，按F12打開開發者工具—>選擇【網絡】

編輯器瀏覽器輸入： https://www.ximalaya.com/yinyue/12521114/

點擊進入音樂[未播放前]

點擊進入音樂[播放中]

[這是一個返回JSON格式數據的URL] 查看搜索界面的源代碼，查找albumId，再通過這些albumId獲取音頻文件的URL：

https://www.ximalaya.com/revision/play/album?albumId=12521114&pageNum=1&sort=-1&pageSize=30

最後使用函數urllib.request.urlretrieve（）下載音樂即可

附源碼：

import re
import json
import requests
import urllib
from urllib import request


class XimaLaya(object):
    """Download the audio files of a single Ximalaya album page.

    Workflow: fetch the album page, extract the album id and titles from
    the page text with regular expressions, query the album play endpoint
    and download every audio URL it lists.

    Every file (debug logs, audio) is written below ``base_dir``; missing
    directories are created on demand.
    """

    # Seconds to wait for the server on each HTTP request, so a stalled
    # connection cannot hang the script forever.
    TIMEOUT = 10

    def __init__(self, base_dir='D:\\XiMaLaya'):
        """Set the request headers and the output directory.

        Args:
            base_dir: Root directory for all files this class writes.
                Defaults to the location the script originally hard-coded.
        """
        # Browser-like User-Agent sent with every request.
        self.header = {
            "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0'
        }
        self.base_dir = base_dir

    def _path(self, *parts):
        """Return ``base_dir`` joined with *parts*, creating the parent directory."""
        # Imported here because this script's import list does not include os.
        import os

        target = os.path.join(self.base_dir, *parts)
        folder = os.path.dirname(target)
        if folder:
            os.makedirs(folder, exist_ok=True)
        return target

    @staticmethod
    def _safe_name(name):
        """Replace characters that are not allowed in Windows file names."""
        return re.sub(r'[\\/:*?"<>|]', '_', name)

    # Step 1: fetch the page
    def getHtml(self, pinyin):
        """Fetch ``https://www.ximalaya.com/yinyue/<pinyin>``.

        Args:
            pinyin: Path segment appended to the base URL.

        Returns:
            The ``requests`` response object for the page.
        """
        url = 'https://www.ximalaya.com/yinyue/' + pinyin
        print("訪問的網站是： " + url)
        html = requests.get(url, headers=self.header, timeout=self.TIMEOUT)
        # apparent_encoding guesses the charset from the body; setting
        # html.encoding = 'utf8' directly would be a cheaper alternative.
        html.encoding = html.apparent_encoding
        print(html)
        return html

    # Step 2: resolve the first album id to its audio URLs
    def getAlbumId(self, html):
        """Return the audio URLs of the first album found in *html*.

        Args:
            html: Response object whose ``text`` contains ``"albumId":<id>``
                entries.

        Returns:
            List of ``src`` URLs found in the album play response, or an
            empty list when the page contains no album id.
        """
        # Capture only the digits; a greedy (.*) would swallow the rest of
        # the line and corrupt the request URL.
        album_ids = re.findall(r'"albumId":\s*"?(\d+)', html.text)
        print("專輯信息", album_ids)
        with open(self._path('albumIdAll', 'albumIdAll.txt'), 'a', encoding='utf-8') as f:
            f.writelines(album_id + '\n' for album_id in album_ids)

        url3 = []
        # Only the first album id on the page is processed.
        for album_id in album_ids[:1]:
            url2 = 'https://www.ximalaya.com/revision/play/album?albumId=' + album_id
            print(url2)
            html2 = requests.get(url2, headers=self.header, timeout=self.TIMEOUT)
            # Example match:
            # src   "http://audio.xmcdn.com/group12/M03/2C/AA/wKgDW1WJ7GqxuItqAB8e1LXvuds895.m4a"
            url3 = re.findall(r'"src":"(.*?)"', html2.text)
            # Debug record of what was requested and what came back.
            myList = [
                '獲取對應專輯ID的首頁\r\n' + url2 + '\n---------------------------------------',
                '含有下載URL的集合\r\n' + html2.text + '\n---------------------------------------',
                '下載專輯的URL集合\r\n' + str(url3) + '\n---------------------------------------',
            ]
            with open(self._path('albumIdAll', 'hhh.txt'), 'a', encoding='utf-8') as f:
                # One JSON document per line so repeated runs stay parseable.
                f.write(json.dumps(myList) + '\n')
        print('done')
        return url3

    # Step 3: collect the titles
    def getTitle(self, html):
        """Return every ``"title":"..."`` value found in *html* and log them.

        Args:
            html: Response object exposing the page source as ``text``.

        Returns:
            List of title strings in page order.
        """
        t = re.findall(r'"title":"(.*?)"', html.text)
        with open(self._path('albumIdAll', 'albumId_Name.txt'), 'a', encoding='utf-8') as f:
            f.write(str(t))
        return t

    # Step 4: download the audio files
    def downLoad(self, url, title):
        """Download each URL into ``base_dir`` as ``<title>.m4a``.

        The k-th URL is always paired with ``title[k]``, even after an
        earlier download failed. When *title* is shorter than *url* the
        remaining files are named ``track_<k+1>``.

        Args:
            url: Iterable of audio file URLs.
            title: Sequence of names, parallel to *url*.
        """
        for index, link in enumerate(url):
            name = str(title[index]) if index < len(title) else 'track_%d' % (index + 1)
            try:
                urllib.request.urlretrieve(link, self._path(self._safe_name(name) + '.m4a'))
            except Exception:
                # Best effort per track: report and continue. Unlike a bare
                # ``except`` this still lets Ctrl-C / SystemExit through.
                print(name + "...【下載失敗】")
            else:
                print(name, '...【下載成功】')


if __name__ == '__main__':
    # Script entry point: download every track of one specific album.

    fm = XimaLaya()
    # Album path relative to https://www.ximalaya.com/yinyue/ . getHtml()
    # already prepends that prefix, so repeating "yinyue/" here would request
    # .../yinyue/yinyue/12521114/ instead of the album page.
    str1 = "12521114/"
    # Fetch the album page.
    html = fm.getHtml(str1)
    # Titles found on the album page (used as file names).
    title = fm.getTitle(html)
    # Audio URLs for the album id found on the page.
    url3 = fm.getAlbumId(html)
    # Download the tracks.
    fm.downLoad(url3, title)

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯系本站郵箱yoyou2525@163.com刪除。

猜您在找 Python 爬取喜馬拉雅音頻【教程】如何把喜馬拉雅音頻下載到電腦傻瓜式下載“喜馬拉雅”音頻文件喜馬拉雅音頻下載器 V1.2 支持專輯批量下載喜馬拉雅mp3下載導出喜馬拉雅下載器 Python中使用requests和parsel爬取喜馬拉雅電台音頻喜馬拉雅 FM 已購付費音頻下載 Python爬蟲 -- 喜馬拉雅爬蟲01 Python 視頻、圖片、音頻爬取下載 you-get Python爬蟲之JavaScript逆向，喜馬拉雅加密算法分析喜馬拉雅FM抓包之旅