# -*- coding: UTF-8 -*-
import base64
import copy
import datetime
import os
import re
import sys
import traceback
from urllib.parse import urlencode

import scrapy
from bs4 import BeautifulSoup

from config.proxy.config import *
from spiders.base_spiders.base_spider import *
from spiders.market_supervision_penalty.govement_penalty_base_spider import govement_penalty_base_spider
from utils.common_util import *
from utils.date_util import current_datetime


class ah_market_gov_chuzhou_xinzhen(govement_penalty_base_spider):
    name = "ah_market_gov_chuzhou_xinzhen"
    custom_settings = {
        'CONCURRENT_REQUESTS': '10',
        'CONCURRENT_REQUESTS_PER_DOMAIN': '10',
        # 'DOWNLOAD_DELAY': 0.2,
        'DOWNLOAD_TIMEOUT': 90,
        'RETRY_TIMES': 30,
        'HTTPERROR_ALLOWED_CODES': [407, 302],
        'RETRY_HTTP_CODES': [504, 408, 500, 502, 503, 533, 407, 401, 403, 404, 400, 478],
        'REDIRECT_ENABLED': False,
        'COOKIES_ENABLED': False,
        'DOWNLOADER_MIDDLEWARES': {
            'scrapy.downloadermiddlewares.retry.RetryMiddleware': None,
            'extensions.proxy.retry.RetryMiddleware': 550,
            'extensions.proxy.ip_utils.ProxyMiddleware': 555,
            'scrapy.downloadermiddlewares.cookies.CookiesMiddleware': 700,
            'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 750,
            'scrapy_splash.SplashCookiesMiddleware': 800,
            'scrapy_splash.SplashMiddleware': 850,
            'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 900,
        },
    }
    is_not_change_proxy = True  # use a single proxy for the whole crawl
    is_proxy = True
    proxy_type = PROXY_TYPE_WD
    proxy_count = 50

    def __init__(self, increment=None, *args, **kwargs):
        super(ah_market_gov_chuzhou_xinzhen, self).__init__(*args, **kwargs)
        self.increment = increment
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Host': 'www.chuzhou.gov.cn',
            'Upgrade-Insecure-Requests': '1',
            'Referer': 'https://www.chuzhou.gov.cn/public/column/108578180?type=4&catId=161735210&action=list&nav=3',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36',
        }
        # JS template for the WZWS anti-bot challenge. The three %s slots are
        # filled with values scraped from the challenge page; _0x13698a()
        # builds the confirm token from them (see parse_cookie below).
        self.js_file = """
            var _0x500dd8 = '%s';
            var _0x14e579 = '%s';
            var _0x351708 = '%s';
            var _0x41f35b = 'WZWS_METHOD';
            var _0x349042 = 'WZWS_PARAMS';
            var btoa = function (str_) {
                return new Buffer.from(str_, "binary").toString("base64")
            }
            function _0x13698a() {
                var _0x338d15 = 0x0;
                var _0xbe152f = 0x0;
                for (_0xbe152f = 0x0; _0xbe152f < _0x14e579.length; _0xbe152f++) {
                    _0x338d15 += _0x14e579.charCodeAt(_0xbe152f);
                }
                _0x338d15 *= _0x351708;
                _0x338d15 += 0x1b207;
                return "WZWS_CONFIRM_PREFIX_LABEL" + _0x338d15;
            }
        """
        self.index_url = "https://www.chuzhou.gov.cn/chuzhou/site/label/8888?IsAjax=1&dataType=html&_=0.6834244290108127&labelName=publicInfoList&siteId=2653861&pageSize=20&pageIndex=1&action=list&isDate=true&dateFormat=yyyy-MM-dd&length=50&organId=108578180&type=4&catId=161735210&cId=&result=%E6%9A%82%E6%97%A0%E7%9B%B8%E5%85%B3%E4%BF%A1%E6%81%AF&title=&fileNum=&keyWords=&file=%2Fc1%2Fchuzhou%2FpublicInfoList_newest"

    def start_requests(self):
        yield scrapy.Request(url=self.index_url, method='GET', headers=self.headers,
                             encoding="utf-8", dont_filter=True)

    def parse(self, response):
        resp_url = response.url
        resp_meta = copy.deepcopy(response.meta)
        try:
            if "jsjiami.com.v6" in response.text:
                # Anti-bot challenge page: solve it, then request the confirm
                # URL with the Set-Cookie values echoed back.
                cookie_url = self.parse_cookie(response.text)
                header = deepCopy(self.headers)
                cookie_str, cookie_dict = getSetcookie2Str(response)
                header["Cookie"] = cookie_str
                yield scrapy.Request(url=cookie_url, method='GET', headers=header, encoding='UTF-8',
                                     dont_filter=True, meta=resp_meta, callback=self.parse)
            elif response.status == 302:
                # Challenge confirmed; the server replies 302. Retry the page
                # we actually wanted, carrying the anti-bot cookie.
                header = deepCopy(self.headers)
                cookie_str, cookie_dict = getSetcookie2Str(response)
                header["Cookie"] = cookie_str
                if resp_meta.get('list_url'):
                    yield scrapy.Request(url=resp_meta['list_url'], method='GET', headers=header,
                                         encoding='UTF-8', dont_filter=True, meta=resp_meta,
                                         callback=self.parse_list)
                else:
                    yield scrapy.Request(url=self.index_url, method='GET', headers=header, encoding='UTF-8',
                                         dont_filter=True, meta=resp_meta, callback=self.parse)
            else:
                # Normal index page: read the page count and fan out over the
                # paginated list (only the first 2 pages in incremental mode).
                page_number = re.findall(r'pageCount:(.*?),', response.text)[0]
                search_number = 2 if self.increment else int(page_number)
                for index in range(1, search_number + 1):
                    send_url = 'https://www.chuzhou.gov.cn/chuzhou/site/label/8888?IsAjax=1&dataType=html&_=0.062391026092820656&labelName=publicInfoList&siteId=2653861&pageSize=20&pageIndex={}&action=list&isDate=true&dateFormat=yyyy-MM-dd&length=50&organId=108578180&type=4&catId=161735210&cId=&result=%E6%9A%82%E6%97%A0%E7%9B%B8%E5%85%B3%E4%BF%A1%E6%81%AF&title=&fileNum=&keyWords=&file=%2Fc1%2Fchuzhou%2FpublicInfoList_newest'.format(index)
                    yield scrapy.Request(url=send_url, method='GET', headers=self.headers, meta=resp_meta,
                                         encoding="utf-8", dont_filter=True, callback=self.parse_list)
        except Exception:
            traceback.print_exc()
            self.logger.info(f"parse error url: {resp_url}")

    def parse_list(self, response):
        resp_url = response.url
        resp_meta = copy.deepcopy(response.meta)
        try:
            if "jsjiami.com.v6" in response.text:
                # Hit the anti-bot page again: bounce back through parse() to
                # solve the challenge, remembering which list page we wanted.
                yield scrapy.Request(url=resp_url, method='GET', headers=self.headers,
                                     meta={**resp_meta, 'list_url': resp_url}, encoding="utf-8",
                                     dont_filter=True, callback=self.parse)
            else:
                resp_soup = BeautifulSoup(response.text, 'html5lib')
                detail_list = resp_soup.select('ul.xxgk_navli2')
                for detail in detail_list:
                    if "href" in str(detail):
                        detail_url = response.urljoin(detail.select_one('a')['href'])
                        yield scrapy.Request(url=detail_url, method='GET', headers=self.headers,
                                             encoding="utf-8", dont_filter=True,
                                             callback=self.parse_detail)
        except Exception:
            traceback.print_exc()
            self.logger.info(f"parse error url: {resp_url}")

    def parse_cookie(self, resp_body):
        # Pull the three obfuscated challenge variables out of the page, run
        # the JS template to compute the confirm token, then base64 it into
        # the wzwschallenge query string.
        _0x500dd8 = re.findall(r"_0x500dd8='(.*?)'", resp_body, re.DOTALL)[0]
        _0x14e579 = re.findall(r"_0x14e579='(.*?)'", resp_body, re.DOTALL)[0]
        _0x351708 = re.findall(r"_0x351708='(.*?)'", resp_body, re.DOTALL)[0]
        data = pyv8_engine_service(self.js_file % (_0x500dd8, _0x14e579, _0x351708), "_0x13698a")
        cookie_url = ("http://www.chuzhou.gov.cn" + _0x500dd8
                      + "?wzwschallenge=" + base64.b64encode(data.encode()).decode())
        return cookie_url
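

# A minimal sketch, not wired into the spider above: the _0x13698a() function
# is plain arithmetic, so the pyv8_engine_service JS round-trip could in
# principle be replaced with pure Python. solve_wzws_challenge is a
# hypothetical helper; it assumes _0x351708 is a decimal integer string, as in
# the JS template, and mirrors parse_cookie()'s URL construction.
def solve_wzws_challenge(path, seed, factor):
    """Rebuild the WZWS confirm URL from the three challenge variables.

    path   -> _0x500dd8 (the confirm path)
    seed   -> _0x14e579 (string whose char codes are summed)
    factor -> _0x351708 (decimal multiplier, as a string)
    """
    # Same computation as the JS: sum(charCodes(seed)) * factor + 0x1b207.
    total = sum(ord(ch) for ch in seed) * int(factor) + 0x1b207
    token = "WZWS_CONFIRM_PREFIX_LABEL" + str(total)
    # parse_cookie() base64-encodes the returned token before appending it.
    challenge = base64.b64encode(token.encode()).decode()
    return "http://www.chuzhou.gov.cn" + path + "?wzwschallenge=" + challenge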