Python爬蟲之JavaScript逆向,喜馬拉雅加密算法分析


前言

這幾天一直在聽評書,發現喜馬拉雅上的資源很多,不過很可惜都是付費的,所以我充了一個月會員,簡單寫了個爬蟲,爬下來幾十部,夠我聽一年的了

開始分析

打開chrome控制台,點擊播放,最先拿到的一個接口就是

https://mpay.ximalaya.com/mobile/track/pay/244130607/?device=pc

當然這個是付費的一部書,所以如果你瀏覽器不帶 會員的cookie是訪問不到的,其中的數字 244130607,這個在他們的接口中叫做 trackId, 每個音頻文件對應唯一的一個 trackId

Python爬蟲之JavaScript逆向,喜馬拉雅加密算法分析

 

也就是對應這個界面的后面的數字,通過這個唯一的trackId可以獲取到音頻文件,那么看一下這個接口返回的內容

 
         

"""
當然在學習Python的道路上肯定會困難,沒有好的學習資料,怎么去學習呢?
學習Python中有不明白推薦加入交流群號:928946953
群里有志同道合的小伙伴,互幫互助, 群里有不錯的視頻學習教程和PDF!
還有大牛解答!
"""


{
"ret": 0, "msg": "0", "trackId": 244130607, "uid": 170217760, "albumId": 30816438, "title": "《三體》第一季 第十集 聚會與大撕裂", "domain": "http://audiopay.cos.xmcdn.com", "totalLength": 12780565, "sampleDuration": 0, "sampleLength": 0, "isAuthorized": true, "apiVersion": "1.0.0", "seed": 9583, "fileId": "27*31*44*62*1*8*6*48*52*4*6*17*16*6*35*35*6*43*25*27*48*63*58*4*50*47*60*64*15*39*59*49*2*36*48*48*16*58*18*44*2*32*12*7*52*64*51*26*29*4*22*", "buyKey": "617574686f72697a6564", "duration": 1578, "ep": "20NvOoh6T39X3qwKO4cY5g5bVhg+1nfPHIQafFTmCXihnrqF2PjczO8O0auK1KJhDrJ30XMYfKJo2uz+xgwd3rwRPi5f", "highestQualityLevel": 1, "downloadQualityLevel": 1, "authorizedType": 1 }

 

這里我充了會員,所以可以直接在瀏覽器中打開這個url。返回內容中有用的字段只有幾個:seed 和 fileId 兩個字段通過js加密算法計算出 m4a 的路徑,再拼接到主域名(domain)之后;ep 則經過另一個加密算法得到url的訪問參數 buy_key、sign、token、timestamp。最后將它們拼接到一起,才是一個完整的音頻url

兩個js加密算法

經過我調試我分別找到了這兩個加密的 js算法

  1. 計算 m4a的路徑js算法:
/**
 * Seeded character shuffler used to turn a `fileId` index list into a path.
 *
 * Constructing an instance immediately builds `_cgStr`, a shuffled copy of a
 * fixed 68-character alphabet whose order depends only on the seed.
 *
 * @param {number} t - Seed for the generator (the `seed` field of the API response).
 */
function vt(t) {
    this._randomSeed = t;
    this.cg_hun();
}

vt.prototype = {
    /**
     * Fill `this._cgStr` by repeatedly drawing one character from the
     * remaining alphabet, using `ran()` to choose the position.
     */
    cg_hun: function() {
        var pool = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\\:._-1234567890";
        var rounds = pool.length;
        this._cgStr = "";
        for (var step = 0; step < rounds; step++) {
            var picked = pool.charAt(parseInt(this.ran() * pool.length));
            this._cgStr += picked;
            // Remove the drawn character so it cannot be drawn a second time.
            pool = pool.split(picked).join("");
        }
    },

    /**
     * Translate a "*"-separated list of indices into characters of `_cgStr`.
     * The piece after the final "*" is always ignored (the sample `fileId`
     * ends with "*", so that piece is empty).
     *
     * @param {string} t - Index list such as "27*31*44*".
     * @returns {string} The looked-up characters, concatenated.
     */
    cg_fun: function(t) {
        return t
            .split("*")
            .slice(0, -1)
            .map((index) => this._cgStr.charAt(index))
            .join("");
    },

    /**
     * Advance the linear congruential generator by one step.
     *
     * @returns {number} The new state scaled into [0, 1).
     */
    ran: function() {
        var next = (211 * this._randomSeed + 30031) % 65536;
        this._randomSeed = next;
        return next / 65536;
    },

};

/**
 * Decode a `seed` / `fileId` pair into the audio file path.
 *
 * @param {number} t - The `seed` field of the API response.
 * @param {string} e - The `fileId` field of the API response ("*"-separated indices).
 * @returns {string} The decoded path, guaranteed to start with "/".
 */
// Declared explicitly: a bare `c = ...` creates an implicit global and throws a
// ReferenceError in strict mode / ES modules. `var` (not `const`) is used because
// a later snippet in this article re-declares `c` with `var`.
var c = function(t, e) {
    var n = new vt(t).cg_fun(e);
    // Prefix a slash only when the decoded path does not already have one.
    return "/" === n[0] ? n : "/".concat(n);
};

// Demo: decode the sample seed/fileId pair taken from the API response quoted earlier in this article.
console.log(c(9583,"27*31*44*62*1*8*6*48*52*4*6*17*16*6*35*35*6*43*25*27*48*63*58*4*50*47*60*64*15*39*59*49*2*36*48*48*16*58*18*44*2*32*12*7*52*64*51*26*29*4*22*"))

 

用node跑一下可以得到 m4a的路徑
輸出

/group3/M04/9E/88/wKgMbF4ejn2TfGPRAMMEFYoRHXs027.m4a
  2. 通過ep來計算url參數的js算法:
/**
 * Always throws: reports that a value could not be destructured because it is
 * not iterable.
 *
 * Declared with `const` because the original bare assignment created an
 * implicit global, which is a ReferenceError in strict mode / ES modules.
 *
 * @throws {TypeError} Unconditionally.
 */
const Z = function() {
    throw new TypeError("Invalid attempt to destructure non-iterable instance");
};

/**
 * Collect values from an iterable into an array, optionally stopping early.
 *
 * Declared with `const` (the original bare assignment created an implicit
 * global) and written with `for...of`, which performs the same iterator
 * bookkeeping the original did by hand: when the loop stops early the
 * iterator's `return()` method is called if it has one, and an error raised
 * while iterating propagates to the caller.
 *
 * @param {Iterable<*>} t - Source of values.
 * @param {number} [e] - Maximum number of values to take; a falsy value means "take all".
 * @returns {Array<*>} The collected values.
 * @throws {TypeError} If `t` is not iterable.
 */
const J = function(t, e) {
    const collected = [];
    for (const value of t) {
        collected.push(value);
        // Stop as soon as the requested number of items has been gathered.
        if (e && collected.length === e) {
            break;
        }
    }
    return collected;
};

/**
 * Pass an array through unchanged.
 *
 * Declared with `const` because the original bare assignment created an
 * implicit global, which is a ReferenceError in strict mode / ES modules.
 *
 * @param {*} t - Any value.
 * @returns {Array<*>|undefined} `t` itself when it is an array, otherwise `undefined`.
 */
const Q = function(t) {
    if (Array.isArray(t)) {
        return t;
    }
};

/**
 * Turn a value into an array for destructuring: arrays are returned as-is by
 * `Q`, other values are read through `J`, and `Z` throws as the last resort.
 *
 * Declared with `const` because the original bare assignment created an
 * implicit global, which is a ReferenceError in strict mode / ES modules.
 *
 * @param {*} t - Value to destructure.
 * @param {number} [e] - Maximum number of items to take from a non-array iterable.
 * @returns {Array<*>} The array, or the items collected from the iterable.
 */
const tt = function(t, e) {
    return Q(t) || J(t, e) || Z();
};

/**
 * RC4-style stream cipher: a 256-entry state table is scheduled from the key,
 * then each input character code is XOR-ed with one keystream value.
 * Applying the function twice with the same key restores the input.
 *
 * @param {string} t - Key.
 * @param {string} e - Text to transform (encrypt or decrypt).
 * @returns {string} Transformed text, one output character per input character.
 */
function yt(t, e) {
    const box = [];
    for (let idx = 0; idx < 256; idx++) {
        box[idx] = idx;
    }

    // Key scheduling: mix the key into the state table.
    let j = 0;
    for (let idx = 0; idx < 256; idx++) {
        j = (j + box[idx] + t.charCodeAt(idx % t.length)) % 256;
        [box[idx], box[j]] = [box[j], box[idx]];
    }

    // Keystream generation, XOR-ed onto the input one character at a time.
    let i = 0;
    j = 0;
    let result = "";
    for (let pos = 0; pos < e.length; pos++) {
        i = (i + 1) % 256;
        j = (j + box[i]) % 256;
        [box[i], box[j]] = [box[j], box[i]];
        result += String.fromCharCode(e.charCodeAt(pos) ^ box[(box[i] + box[j]) % 256]);
    }
    return result;
}

// Key material used by bt(): mt is the result of running yt with key "xm" over a
// fixed literal, and gt is the 36-entry table bt() uses to remap "d" + mt + "9".
// NOTE(review): the literal contains "†" (U+2020), which is outside the 0-255
// range the rest of this cipher code works with — it may have been mis-decoded
// when the snippet was copied; verify against the original script.
var mt = yt("xm", "Ä[üJ=†Û3áf÷N");
// Declared explicitly: the original line had no declaration keyword (and the
// previous statement no semicolon), so gt was created as an implicit global.
const gt = [19, 1, 4, 7, 30, 14, 28, 8, 24, 17, 6, 35, 34, 16, 9, 10, 13, 22, 32, 29, 31, 21, 18, 3, 2, 23, 25, 27, 11, 20, 5, 15, 12, 0, 33, 26];

/**
 * Decrypt the `ep` field of the API response.
 *
 * Steps: remap the characters of "d" + mt + "9" through the table `gt` to get
 * the cipher key, base64-decode `ep`, run both through `yt`, and split the
 * result on "-". The pieces are logged and — unlike the original, which
 * returned nothing even though its caller stores the result — also returned.
 * The function is now a proper declaration; the original assigned `bt` and
 * `e2` without declaring them, creating implicit globals.
 *
 * @param {string} t - The base64 `ep` value.
 * @returns {string[]} The decrypted pieces; per the sample output in this
 *     article they are buy_key, sign, token and timestamp.
 */
function bt(t) {
    // Remap every character of `key` through `table`: a-z count as 0..25, any
    // other character is treated as a digit and counts as 26..35; the value is
    // replaced by the position where it occurs in the table and converted back.
    const remapKey = function(key, table) {
        const out = [];
        for (let r = 0; r < key.length; r++) {
            const ch = key[r];
            let code = "a" <= ch && "z" >= ch ? ch.charCodeAt(0) - 97 : ch.charCodeAt(0) - 48 + 26;
            for (let i = 0; i < 36; i++) {
                if (table[i] === code) {
                    code = i;
                    break;
                }
            }
            out[r] = code > 25 ? String.fromCharCode(code - 26 + 48) : String.fromCharCode(code + 97);
        }
        return out.join("");
    };

    // Base64 decoder producing one character per decoded byte. Characters that
    // map to -1 in the lookup table are skipped in the first two positions of a
    // group; "=" in the third or fourth position ends decoding.
    const decodeBase64 = function(input) {
        if (!input) {
            return "";
        }
        const a = [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 62, -1, -1, -1, 63, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -1, -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1, -1, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -1, -1, -1, -1, -1];
        const text = input.toString();
        const o = text.length;
        let r = 0;
        let i = "";
        let e;
        let n;
        while (r < o) {
            // First sextet of the group.
            do {
                e = a[255 & text.charCodeAt(r++)];
            } while (r < o && -1 === e);
            if (-1 === e) {
                break;
            }
            // Second sextet: together with the first it yields byte 1.
            do {
                n = a[255 & text.charCodeAt(r++)];
            } while (r < o && -1 === n);
            if (-1 === n) {
                break;
            }
            i += String.fromCharCode(e << 2 | (48 & n) >> 4);
            // Third sextet (or "=" padding): yields byte 2.
            do {
                if (61 === (e = 255 & text.charCodeAt(r++))) {
                    return i;
                }
                e = a[e];
            } while (r < o && -1 === e);
            if (-1 === e) {
                break;
            }
            i += String.fromCharCode((15 & n) << 4 | (60 & e) >> 2);
            // Fourth sextet (or "=" padding): yields byte 3.
            do {
                if (61 === (n = 255 & text.charCodeAt(r++))) {
                    return i;
                }
                n = a[n];
            } while (r < o && -1 === n);
            if (-1 === n) {
                break;
            }
            i += String.fromCharCode((3 & e) << 6 | n);
        }
        return i;
    };

    const e1 = yt(remapKey("d" + mt + "9", gt), decodeBase64(t)).split("-");

    console.log(e1);
    return e1;
}

// Demo: run bt on the sample `ep` value from the API response quoted earlier in this article.
var c = bt("20NvOoh6T39X3qwKO4cY5g5bVhg+1nfPHIQafFTmCXihnrqF2PjczO8O0auK1KJhDrJ30XMYfKJo2uz+xgwd3rwRPi5f")

 

這段js比較復雜,調試的時候坑死我了,不在同一個地方,導致我來回復制,最終才把這個算法整理到這一個js文件中,依然用 node跑一下,輸出:

[
  '617574686f72697a6564', 'ef9a0678d77870843ef203d6333ce021', '5790', '1598533668' ]

這幾個參數分別對應的是:buy_key sign token timestamp
有了這兩個js算法就可以完全地解析 這個接口返回的參數了。

python 代碼仿寫加密算法

  1. 計算 m4a路徑加密算法
class vt():
    """Seeded character shuffler that turns a ``fileId`` index list into a path.

    Creating an instance immediately builds ``_cgStr``, a shuffled copy of a
    fixed 68-character alphabet whose order depends only on the seed.
    """

    def __init__(self,t):
        self._randomSeed = t
        self.cg_hun()

    def ran(self):
        """Advance the linear congruential generator; return the state scaled into [0, 1)."""
        state = (211 * self._randomSeed + 30031) % 65536
        self._randomSeed = state
        return state / 65536

    def cg_hun(self):
        """Fill ``_cgStr`` by drawing characters one at a time from the remaining alphabet."""
        self._cgStr = ""
        pool = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\\:._-1234567890"
        drawn = []
        for _ in range(len(pool)):
            ch = pool[int(self.ran() * len(pool))]
            drawn.append(ch)
            # Remove the drawn character so it cannot be drawn a second time.
            pool = pool.replace(ch, "")
        self._cgStr = "".join(drawn)

    def cg_fun(self,t):
        """Map a "*"-separated index list to characters of ``_cgStr``.

        The piece after the final "*" is always ignored; an empty piece counts as 0.
        """
        indices = [int(piece) if piece else 0 for piece in t.split("*")]
        return "".join(self._cgStr[k] for k in indices[:-1])

def path_decode(seed,fileId):
    """Decode a ``seed`` / ``fileId`` pair into the audio file path.

    seed:   the ``seed`` field of the API response.
    fileId: the ``fileId`` field of the API response ("*"-separated indices).

    Returns the decoded path, always starting with "/". The JavaScript
    reference function ``c`` adds the slash when it is missing; this port
    previously did not, so appending the result to the domain produced a
    broken URL.
    """
    p = vt(seed).cg_fun(fileId)
    return p if p.startswith("/") else "/" + p

# Demo: decode the sample seed/fileId pair from the API response and print the resulting path.
if __name__ == '__main__':
    result = path_decode(9583,"27*31*44*62*1*8*6*48*52*4*6*17*16*6*35*35*6*43*25*27*48*63*58*4*50*47*60*64*15*39*59*49*2*36*48*48*16*58*18*44*2*32*12*7*52*64*51*26*29*4*22*")
    print(result)
  2. 通過ep來計算url參數的加密算法:
def yt(t, e):
    """RC4-style stream cipher.

    A 256-entry state table is scheduled from the key ``t``; every character
    of ``e`` is then XOR-ed with one keystream value. Applying the function
    twice with the same key restores the input.

    Returns a str with one output character per input character.
    """
    box = list(range(256))

    # Key scheduling: mix the key into the state table.
    j = 0
    for idx in range(256):
        j = (j + box[idx] + ord(t[idx % len(t)])) % 256
        box[idx], box[j] = box[j], box[idx]

    # Keystream generation, XOR-ed onto the input one character at a time.
    i = 0
    j = 0
    out = []
    for ch in e:
        i = (i + 1) % 256
        j = (j + box[i]) % 256
        box[i], box[j] = box[j], box[i]
        out.append(chr(ord(ch) ^ box[(box[i] + box[j]) % 256]))
    return "".join(out)

def _bt_permute_key(key, table):
    """Remap every character of ``key`` through the 36-entry ``table``."""
    out = []
    for ch in key:
        # a-z count as 0..25; any other character is treated as a digit -> 26..35.
        if "a" <= ch <= "z":
            code = ord(ch) - 97
        else:
            code = ord(ch) - ord("0") + 26
        # Replace the value by the position at which it occurs in the table.
        for pos in range(36):
            if table[pos] == code:
                code = pos
                break
        if 25 < code:
            out.append(chr(code - 26 + ord("0")))
        else:
            out.append(chr(code + 97))
    return "".join(out)


def _bt_decode_ep(ep):
    """Base64-decode ``ep`` into a str holding one character per decoded byte."""
    import base64  # local import: the block must not rely on a file-level import

    if not ep:
        return ""
    # Tolerate input whose trailing "=" padding was dropped.
    padded = ep + "=" * (-len(ep) % 4)
    return base64.b64decode(padded).decode("latin-1")


def bt(t):
    """Decrypt the ``ep`` field of the API response.

    The cipher key is "d" + mt + "9" remapped through ``gt``; ``t`` is
    base64-decoded and run through ``yt`` with that key, and the plaintext is
    split on "-".

    Changes from the previous version: the key is no longer built in a
    256-space buffer that was ``strip()``-ed afterwards, and the hand-written
    base64 loop (which raised IndexError on input without padding) is replaced
    by the standard library decoder.

    t: the base64 ``ep`` value (str).

    Returns a dict with the keys buy_key, sign, token and timestamp.
    Raises ValueError if the decrypted text does not consist of exactly four
    "-"-separated fields.
    """
    key = _bt_permute_key("d" + mt + "9", gt)
    fields = yt(key, _bt_decode_ep(t)).split('-')
    if len(fields) != 4:
        raise ValueError(
            "unexpected ep payload: expected 4 '-'-separated fields, got %d" % len(fields)
        )
    buy_key, sign, token, timestamp = fields
    return dict(
        buy_key=buy_key,
        sign=sign,
        token=token,
        timestamp=timestamp,
    )

# Key material read by bt(): mt is the result of running yt with key "xm" over a
# fixed literal, and gt is the 36-entry table bt() uses to remap "d" + mt + "9".
# NOTE(review): the literal contains "†" (U+2020), which is outside the 0-255
# range the rest of this cipher code works with — it may have been mis-decoded
# when the snippet was copied; verify against the original script.
mt = yt("xm", "Ä[üJ=†Û3áf÷N")
gt = [19, 1, 4, 7, 30, 14, 28, 8, 24, 17, 6, 35, 34, 16, 9, 10, 13, 22, 32, 29, 31, 21, 18, 3, 2, 23, 25, 27, 11, 20, 5, 15, 12, 0, 33, 26]

def ep_decode(ep):
    """Public entry point: decrypt an ``ep`` value by delegating to ``bt``."""
    return bt(ep)

# Demo: decrypt the sample `ep` value from the API response and print the result.
if __name__ == '__main__':
    print(ep_decode('20NvOoh6T39X3qwKO4cY5g5bVhg+1nfPHIQafFTmCXihnrqF2PjczO8O0auK1KJhDrJ30XMYfKJo2uz+xgwd3rwRPi5f'))

 

這個接口到此為止才算是完全可以解析。

免費接口分析

如果你沒有充會員,免費的音頻還是可以聽的,我找到一個免費音頻的接口

https://www.ximalaya.com/revision/play/v1/audio?id=324681559&ptype=1
{
"ret": 200,
"data": {
"trackId": 324681559,
"canPlay": true,
"isPaid": false,
"hasBuy": true,
"src": "https://aod.cos.tx.xmcdn.com/group84/M03/4A/A6/wKg5Hl8s0cTwcp6xABQ0EbeuW5Q193.m4a",
"albumIsSample": false,
"sampleDuration": 48,
"isBaiduMusic": false,
"firstPlayStatus": true,
"isVipFree": false
}
}

 

這個接口還是比較簡單的,返回值里面直接包含 m4a音頻地址,沒有加密措施,另外 url中的數字依然是 trackId,值得一提的是免費音頻的trackId不能用在付費接口,我猜測是版本迭代的問題,或者是客戶端不同的問題,因為當時我不只是分析網頁的接口,還抓包了電腦客戶端的接口,具體對應的是網頁還是客戶端我也忘了。

解析整本書的接口

喜馬拉雅接口主要關鍵的有兩個參數,一個是前面我說的 trackId 另一個就是albumId,trackId 對應唯一的一個音頻,而 albumId 對應的是唯一的一本書。

https://www.ximalaya.com/revision/album/v1/getTracksList?albumId=30816438&pageNum=1&pageSize=1000

返回值中就有每一集的trackId,其實喜馬拉雅還有很多其他接口,搜索接口等等,一般的其他的接口需要在請求頭中加入xm-sign,我也寫了xm-sign的計算方法:

import requests
import time
import hashlib
import random
import json
from requests.packages.urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

# Build the xm-sign request header

def get_sign(headers):
    """Add an ``xm-sign`` entry to ``headers`` and return the same dict.

    The server time is fetched from the site's time endpoint (using the given
    headers). The signature is the MD5 hex digest of "himalaya-<server time>",
    followed by a random number in parentheses, the server time, a second
    random number in parentheses, and the local time in milliseconds.
    """
    time_response = requests.get("https://www.ximalaya.com/revision/time",headers=headers,verify=False)
    serverTime = time_response.text
    nowTime = str(round(time.time()*1000))

    digest = hashlib.md5("himalaya-{}".format(serverTime).encode()).hexdigest()
    first_salt = round(random.random()*100)
    second_salt = round(random.random()*100)
    headers["xm-sign"] = "%s(%s)%s(%s)%s" % (digest, first_salt, serverTime, second_salt, nowTime)
    return headers

def get_header():
    """Return request headers: a fixed User-Agent plus a fresh ``xm-sign`` from ``get_sign``."""
    return get_sign({
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36",
    })

# Demo: call an endpoint with the signed headers and print the JSON reply.
if __name__ == '__main__':
        # This is a search endpoint
    url = "https://www.ximalaya.com/revision/search/main?core=all&spellchecker=true&device=iPhone&kw=%E9%9B%AA%E4%B8%AD%E6%82%8D%E5%88%80%E8%A1%8C&page=1&rows=20&condition=relation&fq=&paidFilter=false"
    s = requests.get(url,headers=get_header(),verify=False)
    print(s.json())

 

還有很多其他接口,我就懶得說了,因為我不想寫了,有了這些就可以滿足我下載整本書的需求了

最終整合

我寫了 喜馬拉雅 掃碼登陸的腳本,因為我不能每次都去復制瀏覽器中的 cookie,這種重復勞動太傻了

import requests
import re
from threading import Thread
import time
import requests
from io import BytesIO
import http.cookiejar as cookielib
from PIL import Image
import sys
import psutil
from base64 import b64decode
import os

requests.packages.urllib3.disable_warnings()

class show_code(Thread):
    """Thread that opens raw image bytes with PIL and calls ``show()`` on the image."""

    def __init__(self,data):
        super().__init__()
        self.data = data

    def run(self):
        # Wrap the bytes in a file-like object so PIL can open them as an image.
        Image.open(BytesIO(self.data)).show()

def is_login(session):
    """Check whether ``session`` is logged in.

    Tries to load saved cookies into the session's cookie jar (any failure is
    ignored), then requests the current-user endpoint.

    Returns ``(session, True)`` when the reply's ``ret`` field is 200 (the
    reply is printed in that case), otherwise ``(session, False)``.
    """
    try:
        session.cookies.load(ignore_discard=True)
    except Exception:
        # Best effort: carry on without saved cookies if they cannot be loaded.
        pass

    response = session.get(
        "https://www.ximalaya.com/revision/main/getCurrentUser",
        verify=False,
        headers={'User-Agent':"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36"},
    )
    if response.json()['ret'] != 200:
        return session,False
    print(response.json())
    return session,True

def login():
    """Return a ``requests`` session that is logged in to Ximalaya.

    Cookies are kept in ``.cookie/xmly.txt``. If ``is_login`` reports that the
    saved cookies are still valid, the session is returned right away.
    Otherwise a QR code is requested and displayed in a background thread,
    the check endpoint is polled once per second until its ``ret`` field is 0,
    and the cookies are saved.

    Changes from the previous version: the leftover debug ``print("hello")``
    is removed, the cookie directory is created with ``exist_ok=True`` instead
    of a check-then-create sequence, and dead commented-out code is dropped.
    """
    os.makedirs('.cookie', exist_ok=True)
    if not os.path.exists('.cookie/xmly.txt'):
        # Make sure the cookie file exists before the jar is pointed at it.
        with open(".cookie/xmly.txt",'w') as f:
            f.write("")

    session = requests.session()
    session.cookies = cookielib.LWPCookieJar(filename='.cookie/xmly.txt')
    session,status = is_login(session)
    if not status:
        # Ask for a new QR code; the reply carries the image (base64) and its id.
        url = "https://passport.ximalaya.com/web/qrCode/gen?level=L"
        response = session.get(url,verify=False)
        data = response.json()
        t = show_code(b64decode(data['img']))
        t.start()
        qrId = data['qrId']

        # Poll the check endpoint until it reports ret == 0.
        url = 'https://passport.ximalaya.com/web/qrCode/check/%s/%s' % (qrId,int(time.time()*1000))
        while True:
            response = session.get(url,verify=False)
            data = response.json()
            if data['ret'] == 0:
                break
            time.sleep(1)
        session.cookies.save()
    return session
# Demo: run the login flow when this snippet is executed as a script.
if __name__ == '__main__':
    login()

 

這是一個簡單的掃碼登錄腳本,登錄成功后cookie會自動保存成文件,下次使用的時候直接調用:

session = login()

就能在保持登陸狀態下,訪問各種接口


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM