百度翻譯爬蟲-Web版(自動生成sign)


 1 # 面向對象
 2 # 百度翻譯 -- 網頁版(自動獲取token,sign)
 3 import requests
 4 import js2py
 5 import json
 6 import re
 7 
 8 
 9 class WebFanyi:
10     """百度翻譯網頁版爬蟲"""
11     def __init__(self,query_str):
12         self.session = requests.session()
13         headers = {
14             "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36",
15         }
16         self.session.headers = headers
17         self.baidu_url = "https://www.baidu.com/"
18         self.root_url = "https://fanyi.baidu.com/"
19         self.lang_url = "https://fanyi.baidu.com/langdetect"
20         self.trans_url = "https://fanyi.baidu.com/v2transapi"
21         self.query_str = query_str
22 
23     def get_token_gtk(self):
24         '''獲取token和gtk(用於合成Sign)'''
25         self.session.get(self.root_url)
26         resp = self.session.get(self.root_url)
27         html_str = resp.content.decode()
28         token = re.findall(r"token: '(.*?)'", html_str)[0]
29         gtk = re.findall(r"window.gtk = '(.*?)'", html_str)[0]
30         return token,gtk
31 
32     def generate_sign(self,gtk):
33         """生成sign"""
34         # 1. 准備js編譯環境
35         context = js2py.EvalJs()
36         with open('webtrans.js', encoding='utf8') as f:
37             js_data = f.read()
38             js_data = re.sub("window\[l\]",'"'+gtk+'"',js_data)
39             # js_data = re.sub("window\[l\]", "\"{}\"".format(gtk), js_data)
40             # print(js_data)
41             context.execute(js_data)
42         sign = context.e(self.query_str)
43         return sign
44 
45     def lang_detect(self):
46         '''獲取語言轉換類型.eg: zh-->en'''
47         lang_resp = self.session.post(self.lang_url,data={"query":self.query_str})
48         lang_json_str = lang_resp.content.decode()  # {"error":0,"msg":"success","lan":"zh"}
49         lan = json.loads(lang_json_str)['lan']
50         to = "en" if lan == "zh" else "zh"
51         return lan,to
52 
53 
54     def parse_url(self,post_data):
55         trans_resp = self.session.post(self.trans_url,data=post_data)
56         trans_json_str = trans_resp.content.decode()
57         trans_json = json.loads(trans_json_str)
58         result = trans_json["trans_result"]["data"][0]["dst"]
59         print("{}: {}".format(self.query_str,result))
60 
61     def run(self):
62         """實現邏輯"""
63         # 1.獲取百度的cookie,(缺乏百度首頁的cookie會始終報錯998)
64         self.session.get(self.baidu_url)
65         # 2. 獲取百度翻譯的token和gtk(用於合成sign)
66         token, gtk = self.get_token_gtk()
67         # 3. 生成sign
68         sign = self.generate_sign(gtk)
69         # 4. 獲取語言轉換類型.eg: zh-->en
70         lan, to = self.lang_detect()
71         # 5. 發送請求,獲取響應,輸出結果
72         post_data = {
73             "from": lan,
74             "to": to,
75             "query": self.query_str,
76             "transtype": "realtime",
77             "simple_means_flag": 3,
78             "sign": sign,
79             "token": token
80         }
81         self.parse_url(post_data)
82 
if __name__ == "__main__":
    # Demo entry point: translate the sample word "lover".
    translator = WebFanyi("lover")
    translator.run()

上述代碼中用於生成 sign 的 webtrans.js 文件具體代碼如下(可以自己抓包,在瀏覽器的 JS 中打斷點獲取)。注意:其中的 window[l] 是占位符,Python 代碼在執行腳本前會把它替換成帶引號的 gtk 字符串:

 1 // webtrans.js
 2 
 3 function n(r, o) {
 4     for (var t = 0; t < o.length - 2; t += 3) {
 5         var a = o.charAt(t + 2);
 6         a = a >= "a" ? a.charCodeAt(0) - 87 : Number(a),
 7         a = "+" === o.charAt(t + 1) ? r >>> a : r << a,
 8         r = "+" === o.charAt(t) ? r + a & 4294967295 : r ^ a
 9     }
10     return r
11 }
/**
 * Compute the request signature for the query text `r`.
 *
 * Steps:
 *   1. Inputs longer than 30 characters are reduced to their first 10, middle
 *      10 and last 10 characters (a surrogate pair counts as one character).
 *   2. The text is encoded to UTF-8 bytes.
 *   3. The bytes are folded into a 32-bit hash with n(), seeded by the two
 *      dot-separated numbers of the gtk value.
 *
 * Returns a string of the form "<hash>.<hash XOR first gtk number>".
 *
 * The gtk value is read through a placeholder lookup that the Python caller
 * textually replaces with a quoted string before running this script; that
 * expression must therefore stay spelled exactly as it is. (In the page's own
 * code the index variable is built from char codes 103, 116, 107, i.e. "gtk".)
 */
function e(r) {
    var SURROGATE_PAIR = /[\uD800-\uDBFF][\uDC00-\uDFFF]/;
    var pairs = r.match(/[\uD800-\uDBFF][\uDC00-\uDFFF]/g);

    if (null === pairs) {
        var len = r.length;
        if (len > 30) {
            r = "" + r.substr(0, 10) + r.substr(Math.floor(len / 2) - 5, 10) + r.substr(-10, 10);
        }
    } else {
        // Rebuild the text as a list of characters in which every surrogate
        // pair is a single element, so that sampling never splits a pair.
        var pieces = r.split(SURROGATE_PAIR);
        var units = [];
        for (var C = 0, h = pieces.length; h > C; C++) {
            if ("" !== pieces[C]) {
                // The original wrapped this in an undefined helper `a(...)`,
                // which threw a ReferenceError; split("") already yields an
                // array, so it is passed to apply() directly.
                units.push.apply(units, pieces[C].split(""));
            }
            if (C !== h - 1) {
                units.push(pairs[C]);
            }
        }
        var g = units.length;
        if (g > 30) {
            r = units.slice(0, 10).join("") +
                units.slice(Math.floor(g / 2) - 5, Math.floor(g / 2) + 5).join("") +
                units.slice(-10).join("");
        }
    }

    // Placeholder lookup - replaced by the caller with the quoted gtk string.
    var u = window[l] || "";
    var seed = u.split(".");
    var m = Number(seed[0]) || 0;
    var s = Number(seed[1]) || 0;

    // UTF-8 encode r into the byte array S.
    var S = [];
    var c = 0;
    for (var v = 0; v < r.length; v++) {
        var A = r.charCodeAt(v);
        if (128 > A) {
            S[c++] = A;
        } else {
            if (2048 > A) {
                S[c++] = A >> 6 | 192;
            } else {
                if (55296 === (64512 & A) && v + 1 < r.length && 56320 === (64512 & r.charCodeAt(v + 1))) {
                    // High + low surrogate: combine into one code point and
                    // emit the first two bytes of its four-byte form.
                    A = 65536 + ((1023 & A) << 10) + (1023 & r.charCodeAt(++v));
                    S[c++] = A >> 18 | 240;
                    S[c++] = A >> 12 & 63 | 128;
                } else {
                    S[c++] = A >> 12 | 224;
                }
                S[c++] = A >> 6 & 63 | 128;
            }
            S[c++] = 63 & A | 128;
        }
    }

    // Mixing recipes for n(); the original assembled the same strings from
    // individual char codes.
    var F = "+-a^+6";
    var D = "+-3^+b+-f";

    var p = m;
    for (var b = 0; b < S.length; b++) {
        p += S[b];
        p = n(p, F);
    }
    p = n(p, D);
    p ^= s;
    if (0 > p) {
        // Reinterpret the negative 32-bit value as unsigned.
        p = (2147483647 & p) + 2147483648;
    }
    p %= 1e6;
    return p.toString() + "." + (p ^ m);
}

實際上,除了用 js2py 在 Python 中執行 JS 代碼外,還可以使用另一個庫 PyExecJS(導入名為 execjs),不過要先通過 pip install PyExecJS 安裝。下面片段中的 gtk 和 query_str 需按上文的方式事先取得。具體實現代碼如下:

import json

import execjs

# `gtk` and `query_str` must already be defined (the gtk value taken from the
# translate page and the text to translate).
with open("webtrans.js", encoding="utf8") as f:
    js_data = f.read()

# Plain string replacement: no regex escapes to get wrong, and json.dumps
# yields a correctly quoted JS string literal for the gtk value.
js_data = js_data.replace("window[l]", json.dumps(gtk))

# Call the function `e` defined in webtrans.js with query_str as its argument.
sign = execjs.compile(js_data).call("e", query_str)
print(sign)

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM