一、如何識別加速樂
使用加速樂的網站,在沒有cookie的情況下首次訪問返回的狀態碼是521,或者看cookie組成,cookie含有jsl_uid的網站基本上就是加速樂。
二、js分析
新版加速樂主要分為兩層。第一層通過響應頭中的Set-Cookie和請求返回的js代碼生成的cookie拼接就可以過掉。第二層稍微麻煩一些,返回結果是混淆後的js,通過分析解混淆後的js代碼可以發現,第二層主要是對go裡面提供的參數值進行hash運算,然後用運算後的字符串與go裡的一個參數值進行對比,如果通過則返回部分目標cookie,最後再和第一層的jsl_uid進行拼接,生成目標cookie。
def parse_jsl(self, response):
    """Handle the first layer of the Jiasule (JSL) cookie challenge.

    Takes the first ``Set-Cookie`` header of the response, evaluates the
    ``document.cookie=...`` expression embedded in the response body to get
    a second cookie, stores both in ``self.page_header["cookie"]`` and
    re-requests the same URL with ``parse_jsl_detail`` as the callback.

    :param response: response whose body carries the first-layer challenge
        script (still obfuscated).
    :return: generator yielding at most one ``scrapy.Request``; on any
        failure the error is logged with its traceback and nothing is
        yielded.
    """
    resp_body = response.text
    resp_meta = copy.deepcopy(response.meta)
    resp_url = response.url
    try:
        set_cookie_headers = response.headers.getlist("Set-Cookie")
        # Keep only the "name=value" pair of the first Set-Cookie header.
        header_cookie = set_cookie_headers[0].decode().split(";")[0] + ";"

        # The body assigns a JS expression to document.cookie and then
        # touches location; that expression evaluates to the cookie string.
        cookie_expr = re.findall(r'document\.cookie=(.*?);location', resp_body)[0]
        js_source = "function a(){return %s}" % cookie_expr
        # NOTE(review): pyv8_engine_service is defined elsewhere; it is
        # assumed to return a '||'-separated string whose first field is the
        # function's return value -- confirm against its implementation.
        result = pyv8_engine_service(js_source, functionName='a').split('||')
        jsl_cookie1 = result[0].split(";")[0] + ';'

        # Only publish the header once both parts are available, so a
        # failure above does not leave a half-built cookie behind.
        self.page_header["cookie"] = header_cookie + jsl_cookie1
        request = scrapy.Request(
            url=resp_url,
            headers=self.page_header,
            callback=self.parse_jsl_detail,
            dont_filter=True,
            meta={**resp_meta, "resp_url": resp_url},
        )
    except Exception:
        # logger.exception records the traceback; the previous code called
        # traceback.format_exc() and threw the result away.
        self.logger.exception(f"parse error and the url is:{resp_url}")
        return
    # Yield outside the try block so GeneratorExit / errors thrown into the
    # generator by the consumer are not mistaken for parse errors.
    yield request


def parse_jsl_detail(self, response):
    """Handle the second layer of the Jiasule (JSL) cookie challenge.

    Extracts the ``go({...})`` call and the ``document[...]=...`` cookie
    expression from the obfuscated script, runs the script inside a minimal
    fake browser environment, replaces the ``__jsl_clearance_s`` value in
    ``self.page_header["cookie"]`` with the computed one and requests the
    real page, handing the reply to ``parse_list``.

    :param response: response whose body carries the second-layer
        (obfuscated) challenge script.
    :return: generator yielding at most one ``scrapy.Request``; on any
        failure the error is logged with its traceback and nothing is
        yielded.
    """
    resp_meta = copy.deepcopy(response.meta)
    resp_url = response.url
    try:
        resp_body = response.text
        # "};go({...})" -- the tail of the script that closes the preceding
        # definition and invokes go() with the challenge parameters.
        go_param = re.findall(r'(};go\(\{.*?\}\))', resp_body)[0]
        resp_body = resp_body.replace("<script>", "").replace("</script>", "")
        return_body = "return " + re.findall(r"document\[.*?\]=(.*);location", resp_body)[0]
        resp_body = re.sub(r'(};go\(\{.*?\}\))', "", resp_body)
        # Zero out the "vt"/"wt" parameters passed to go().
        # NOTE(review): presumably these are validity/wait times that would
        # otherwise delay the script -- confirm.
        go_param = go_param.replace('"vt":"3600","wt":"1500"', '"vt":"0","wt":"0"')

        # Minimal stand-ins for the browser globals the script reads. The
        # statements are kept on separate lines and terminated explicitly so
        # the stub is valid JavaScript.
        browser_env = """
document = {
    "cookie": ""
};
location = {
    hash: "",
    host: "www.miit.gov.cn",
    hostname: "www.miit.gov.cn",
    href: "https://www.miit.gov.cn/gxsj/tjfx/zh/index.html",
    origin: "https://www.miit.gov.cn",
    pathname: "/gxsj/tjfx/zh/index.html",
    port: "",
    protocol: "https:",
};
window = {
    "navigator": {
        "userAgent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.182 Safari/537.36"
    },
    "document": document,
    "location": location
};
"""
        # go_param[:2] is the "};" that closes the script's open definition
        # (after the injected return statement); go_param[2:] is the
        # "go({...})" call, wrapped in aaa() so the engine can invoke it.
        real_jsl_js = (
            browser_env
            + resp_body
            + return_body
            + go_param[:2]
            + "function aaa(){ return "
            + go_param[2:]
            + "}"
        )

        # NOTE(review): same assumption about pyv8_engine_service's
        # '||'-separated return format as in parse_jsl -- confirm.
        resp_list = pyv8_engine_service(real_jsl_js, functionName='aaa').split('||')
        jsl_result = resp_list[0].split(";")[0]
        # A function replacement keeps backslashes in the cookie value from
        # being interpreted as re.sub escape sequences.
        self.page_header["cookie"] = re.sub(
            r'__jsl_clearance_s=.*?;',
            lambda _match: jsl_result + ";",
            self.page_header["cookie"],
        )
        detail_url = "https://www.miit.gov.cn/api-gateway/jpaas-publish-server/front/page/build/unit?parseType=bulidstatic&webId=8d828e408d90447786ddbe128d495e9e&tplSetId=209741b2109044b5b7695700b2bec37e&pageType=column&tagId=%E5%8F%B3%E4%BE%A7%E5%86%85%E5%AE%B9&editType=null&pageId=ebeccdcd21bc4eeb9655a8890e87c04c"
        request = scrapy.Request(
            url=detail_url,
            headers=self.page_header,
            callback=self.parse_list,
            dont_filter=True,
            meta={**resp_meta, "resp_url": resp_url},
        )
    except Exception:
        # logger.exception records the traceback; the previous code called
        # traceback.format_exc() and threw the result away.
        self.logger.exception(f"parse error and the url is:{resp_url}")
        return
    yield request
三、注意點
(1)使用代理的時候,同一輪流程中的多次請求必須使用同一個ip(is_not_change_proxy = True)