1. How to identify 加速乐 (Jiasule)
A site protected by 加速乐 returns a 521 status code on the first request made without cookies. You can also look at the cookies themselves: if they contain a jsl_uid-style value, the site is almost certainly behind 加速乐.
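For a quick check, a minimal detection sketch (assuming the requests library; the substring test on the cookie name is only illustrative) could look like this:

```python
# Probe a URL once, without cookies, and see whether it behaves like a
# 加速乐-protected site: the first response is 521 and sets a jsl-style cookie.
import requests

def looks_like_jsl(url: str) -> bool:
    resp = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
    set_cookie = resp.headers.get("Set-Cookie", "")
    return resp.status_code == 521 and "jsl" in set_cookie.lower()
```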
2. JS analysis
The new version of 加速乐 has two layers. The first layer can be passed by concatenating the cookie from the Set-Cookie response header with the cookie generated by the JS code returned in the response body. The second layer is a little more involved: the response is obfuscated JS, and analysing the deobfuscated code shows that this layer hashes a value built from the parameters supplied in go(), compares the digest against another parameter inside go(), and, if the comparison passes, yields part of the target cookie, which is finally concatenated with the first layer's jsl_uid to produce the target cookie.
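Before the spider itself, here is a sketch of what the deobfuscated second layer boils down to. This is a hedged, pure-Python illustration rather than the author's approach (the spider below executes the challenge JS in a JS engine instead), and it assumes the go({...}) payload carries fields named bts, chars, ct, ha and tn, as commonly seen in this challenge: two characters taken from chars are inserted between the two parts of bts, the result is hashed with the algorithm named in ha, and the candidate whose digest equals ct is the second-layer cookie value.

```python
import hashlib

def solve_go(go: dict) -> str:
    """Brute-force the go({...}) challenge; the field names are assumptions."""
    hashers = {"md5": hashlib.md5, "sha1": hashlib.sha1, "sha256": hashlib.sha256}
    hasher = hashers[go["ha"]]
    prefix, suffix = go["bts"]
    for c1 in go["chars"]:
        for c2 in go["chars"]:
            candidate = prefix + c1 + c2 + suffix
            if hasher(candidate.encode()).hexdigest() == go["ct"]:
                return f'{go["tn"]}={candidate}'  # e.g. __jsl_clearance_s=<value>
    raise ValueError("no candidate matched ct")
```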
```python
import copy
import re
import traceback

import scrapy

# Both callbacks below are methods of a scrapy.Spider subclass;
# pyv8_engine_service is a project helper that executes a piece of JS
# and returns the value of the named function.


def parse_jsl(self, response):
    """
    :param response:
    :return: grab the first cookie, then re-request the page to get the (still obfuscated) 加速乐 JS
    """
    resp_body = response.text
    resp_meta = copy.deepcopy(response.meta)
    resp_url = response.url
    try:
        # Keep only the name=value part of the first Set-Cookie header.
        cookie_list = response.headers.getlist("Set-Cookie")
        cookie = [i.decode() for i in cookie_list]
        cookie = cookie[0].split(";")[0] + ";"
        self.page_header["cookie"] = cookie
        # First layer: evaluate the inline document.cookie=... expression.
        jsl_js = re.findall(r'document.cookie=(.*?);location', resp_body)
        js_ = "function a(){return %s}" % jsl_js[0]
        result = pyv8_engine_service(js_, functionName='a').split('||')
        jsl_cookie1 = result[0].split(";")[0] + ';'
        self.page_header["cookie"] += jsl_cookie1
        yield scrapy.Request(url=resp_url, headers=self.page_header, callback=self.parse_jsl_detail,
                             dont_filter=True, meta={**resp_meta, "resp_url": resp_url})
    except Exception:
        self.logger.error(traceback.format_exc())
        self.logger.error(f"parse error and the url is:{resp_url}")


def parse_jsl_detail(self, response):
    """
    :param response:
    :return: compute the second cookie, then request the normal page
    """
    resp_body = response.text
    resp_meta = copy.deepcopy(response.meta)
    resp_url = response.url
    # Extract the go({...}) call and the document[...]=... cookie expression,
    # then rebuild a self-contained script with a fake browser environment.
    go_param = re.findall(r'(};go\(\{.*?\}\))', resp_body)[0]
    resp_body = resp_body.replace("<script>", "").replace("</script>", "")
    return_body = "return " + re.findall(r"document\[.*?\]=(.*);location", resp_body)[0]
    resp_body = re.sub(r'(};go\(\{.*?\}\))', "", resp_body)
    # Override the vt/wt values in the go() payload before evaluating it.
    go_param = go_param.replace('"vt":"3600","wt":"1500"', '"vt":"0","wt":"0"')
    real_jsl_js = """
    document = {
        "cookie": ""
    }
    location = {
        hash: "",
        host: "www.miit.gov.cn",
        hostname: "www.miit.gov.cn",
        href: "https://www.miit.gov.cn/gxsj/tjfx/zh/index.html",
        origin: "https://www.miit.gov.cn",
        pathname: "/gxsj/tjfx/zh/index.html",
        port: "",
        protocol: "https:",
    }
    window = {
        "navigator": {
            "userAgent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.182 Safari/537.36"
        },
        "document": document,
        "location": location
    }
    """ + resp_body + return_body + go_param[:2] + "function aaa(){ return " + go_param[2:] + "}"
    try:
        resp_list = pyv8_engine_service(real_jsl_js, functionName='aaa').split('||')
        cookie = resp_list[0]
        jsl_result = cookie.split(";")[0]
        # Replace the old __jsl_clearance_s value with the newly computed one.
        self.page_header["cookie"] = re.sub(r'__jsl_clearance_s=.*?;', jsl_result + ";",
                                            self.page_header["cookie"])
        detail_url = "https://www.miit.gov.cn/api-gateway/jpaas-publish-server/front/page/build/unit?parseType=bulidstatic&webId=8d828e408d90447786ddbe128d495e9e&tplSetId=209741b2109044b5b7695700b2bec37e&pageType=column&tagId=%E5%8F%B3%E4%BE%A7%E5%86%85%E5%AE%B9&editType=null&pageId=ebeccdcd21bc4eeb9655a8890e87c04c"
        yield scrapy.Request(url=detail_url, headers=self.page_header, callback=self.parse_list,
                             dont_filter=True, meta={**resp_meta, "resp_url": resp_url})
    except Exception:
        self.logger.error(traceback.format_exc())
        self.logger.error(f"parse error and the url is:{resp_url}")
```
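pyv8_engine_service above is a project-local helper not shown in the post; it runs a snippet of JS and returns the value of the named function. A rough stand-in (an assumption, not the author's implementation) can be built on PyExecJS:

```python
# Hypothetical replacement for the project-local pyv8_engine_service helper,
# built on PyExecJS (pip install PyExecJS): compile the JS source and call
# the named function, returning its result.
import execjs

def pyv8_engine_service(js_source, functionName):
    ctx = execjs.compile(js_source)
    return ctx.call(functionName)
```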
3. Notes
(1) When using proxies, make sure every request in a single challenge-passing chain goes out through the same IP (is_not_change_proxy = True); see the sketch below.
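is_not_change_proxy is a flag from the author's own proxy middleware. The general idea in Scrapy is to pick a proxy once and let every follow-up request in the chain inherit it through meta, which the spider already forwards via resp_meta. A hypothetical middleware sketch, with placeholder class name and proxy pool:

```python
# Illustrative downloader middleware (names and proxy pool are placeholders):
# pick a proxy once, then reuse whatever proxy is already in request.meta so
# the 521 page, the JS challenge and the target page share one exit IP.
import random

PROXY_POOL = ["http://127.0.0.1:8001", "http://127.0.0.1:8002"]

class StickyProxyMiddleware:
    def process_request(self, request, spider):
        if request.meta.get("is_not_change_proxy") and request.meta.get("proxy"):
            return None  # keep the proxy inherited from the previous request
        request.meta["proxy"] = random.choice(PROXY_POOL)
        return None
```

Such a middleware would still need to be registered in DOWNLOADER_MIDDLEWARES.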