美團旗下所有產品的登錄都採用重定向的方式來完成,
即利用 session 會話來保持登錄狀態;
當然也可以每次請求都帶上它的 cookie 來模擬登錄。
我用的代理是阿布雲代理,你們也可以選擇別的代理。
這次爬取的是美團旗下的榛果民宿。
import datetime
import json
import logging
import time
from urllib.parse import urlencode

import pymysql
import requests
from lxml import etree
from pymysql.err import IntegrityError

# NOTE(review): these proxy URLs have no scheme and no credentials in front of
# the '@' -- presumably the "http://user:password" part was removed before
# publishing. Fill it in before running; confirm the format with the proxy
# provider.
proxies_ = {
    'http': '@http-dyn.abuyun.com:9020',
    'https': '@http-dyn.abuyun.com:9020',
}

# Shared request headers. redirect_login() stores the CSRF token in this dict,
# so it is deliberately module-level mutable state.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36 OPR/48.0.2685.52'
}

# One session for the whole run so that login cookies are carried over to the
# crawl requests.
session = requests.Session()

# Seconds to wait for a single HTTP request before treating it as failed.
REQUEST_TIMEOUT = 30

# Where the captcha image is written so that a human can read and type it in.
CAPTCHA_IMAGE_PATH = 'C:/Users/admin/Desktop/image/zg.jpg'

# XPath prefix shared by every field scraped from a housing detail page.
_DETAIL_ROOT = '//*[@id="J-layout"]/div[2]/div/div[2]/div/div[2]'

# Label shown in the host statistics list -> key stored in the room record.
# NOTE(review): these labels are compared against text scraped from the live
# page; confirm they use the same character set as the site actually serves.
_HOST_STAT_FIELDS = {
    "房源": "room_count",
    "評價": "comment_count",
    "咨詢回復率": "rep_rate",
    "咨詢回復時長": "rep_length",
}


def _request_with_retry(send, url, header, tab, timeout, **kwargs):
    """Call ``send`` up to ``tab`` times and return the first HTTP 200 response.

    :param send: bound session method to call (``session.get`` / ``session.post``)
    :param url: target URL
    :param header: request headers
    :param tab: maximum number of attempts
    :param timeout: per-request timeout in seconds
    :param kwargs: extra keyword arguments forwarded to ``send``
    :return: the response object, or ``False`` when every attempt failed
    """
    for attempts_left in range(tab, 0, -1):
        try:
            response = send(url, headers=header, proxies=proxies_,
                            timeout=timeout, **kwargs)
        except requests.RequestException as e:
            # Only the final failure is logged, to keep retry noise down.
            if attempts_left == 1:
                logging.exception(e)
            continue
        # Throttle after every completed request, successful or not.
        time.sleep(2)
        if response.status_code == 200:
            return response
    return False


def session_get(url, header=headers, tab=12, timeout=REQUEST_TIMEOUT):
    """GET ``url`` through the shared session and proxy, retrying on failure.

    :param url: target URL
    :param header: request headers (defaults to the shared ``headers`` dict)
    :param tab: maximum number of attempts
    :param timeout: per-request timeout in seconds
    :return: the response on HTTP 200, otherwise ``False``
    """
    return _request_with_retry(session.get, url, header, tab, timeout)


def session_post(url, header=headers, data=None, tab=12, timeout=REQUEST_TIMEOUT):
    """POST ``data`` to ``url`` through the shared session and proxy, with retries.

    :param url: target URL
    :param header: request headers (defaults to the shared ``headers`` dict)
    :param data: request body passed to ``session.post``
    :param tab: maximum number of attempts
    :param timeout: per-request timeout in seconds
    :return: the response on HTTP 200, otherwise ``False``
    """
    return _request_with_retry(session.post, url, header, tab, timeout, data=data)


def get_node_text(node, xpath):
    """Evaluate ``xpath`` on ``node`` and return the first result.

    :param node: object exposing an ``xpath`` method (an lxml element)
    :param xpath: XPath expression; ``"string(.)"`` returns the stripped text
        content of the node
    :return: the first match (stripped when it is a string), ``""`` when there
        is no match, or ``None`` when evaluation raised
    """
    try:
        if xpath == "string(.)":
            return node.xpath('string(.)').strip()
        matches = node.xpath(xpath)
        if len(matches) > 0:
            first = matches[0]
            return first.strip() if isinstance(first, str) else first
        return ""
    except Exception:
        logging.exception('獲取xpath %s 出錯' % (xpath))
        return None


def get_youjia_tpp_conn():
    """Open a new connection to the MySQL database.

    :return: a ``pymysql`` connection; the caller is responsible for closing it
    """
    return pymysql.connect(host='host', user='user', passwd='passwd', db='db', port=3306,
                           charset='utf8')


def _insert_or_update(label, t_name, insert_sql, insert_args, update_sql, update_args):
    """Run ``insert_sql``; on a duplicate-key error run ``update_sql`` instead.

    An explicit cursor, commit and close are used so the behaviour does not
    depend on what the connection's context manager does in the installed
    PyMySQL version. Database errors are logged rather than raised.
    """
    conn = get_youjia_tpp_conn()
    try:
        with conn.cursor() as cursor:
            try:
                print(label + " insert_sql : ", t_name)
                cursor.execute(insert_sql, insert_args)
            except IntegrityError:
                print(label + " update_sql : ", t_name)
                cursor.execute(update_sql, update_args)
        conn.commit()
    except Exception as msg:
        logging.exception(msg)
    finally:
        conn.close()


def storage_database_text(data_json, t_name, l_name="youjia_tpp"):
    """Store a flat (non-JSON) record: insert it, or update it if the id exists.

    :param data_json: mapping of column name -> value; must contain ``id``
    :param t_name: table name
    :param l_name: database (schema) name
    :return: None
    :raises ValueError: when ``data_json`` has no ``id`` key
    """
    id_values = [data_json[key] for key in data_json if str(key) == "id"]
    if not id_values:
        raise ValueError("data_json must contain an 'id' key")
    id_key = id_values[-1]

    now_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    columns = [str(key) for key in data_json]
    # Every value is stored as its string form.
    values = tuple(str(data_json[key]) for key in data_json)
    table = l_name + "." + t_name

    # Identifiers cannot be bound as parameters, so column names are quoted
    # and concatenated; every value goes through a %s placeholder.
    column_sql = ", ".join("`" + column + "`" for column in columns)
    placeholder_sql = ", ".join(["%s"] * len(columns))
    insert_sql = "INSERT INTO " + table + " (" + column_sql + ") VALUES (" + placeholder_sql + ");"

    assignment_sql = ", ".join("`" + column + "`=%s" for column in columns)
    update_sql = "UPDATE " + table + " SET " + assignment_sql + ", modify_time=%s WHERE id=%s;"
    update_args = values + (now_time, str(id_key))

    _insert_or_update("storage_database_text", t_name,
                      insert_sql, values, update_sql, update_args)


def storage_database_json(id_, data_json, j_name, t_name, l_name="youjia_tpp"):
    """Store a JSON string in one column: insert the row, or update it if it exists.

    :param id_: primary key of the row
    :param data_json: serialized JSON string to store
    :param j_name: name of the column that holds the JSON
    :param t_name: table name
    :param l_name: database (schema) name
    :return: None
    """
    now_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    table = l_name + "." + t_name
    insert_sql = "INSERT INTO " + table + " (`id`,`" + j_name + "`)VALUES(%s,%s);"
    update_sql = "update " + table + " set `" + j_name + "`=%s , modify_time=%s where id = %s;"
    _insert_or_update("storage_database_json", t_name,
                      insert_sql, (id_, data_json),
                      update_sql, (data_json, now_time, id_))


def pre_login():
    """Fetch the unified login page, which carries the hidden login form fields.

    :return: the page HTML, or ``None`` when the request failed
    """
    param = {
        'service': 'phoenix',
        'continue': 'https://www.zhenguo.com/auth/authenticated/?continue=/help/trust/',
    }
    url = 'https://passport.meituan.com/account/unitivelogin?' + urlencode(param)
    response = session_get(url=url, header=headers, tab=5)
    if not response:
        # session_get returns False once all retries are exhausted.
        print('預登陸出錯')
        return None
    print("pre_login 成功")
    return response.text


def parse_param(html):
    """Extract the hidden login form fields from the login page.

    :param html: HTML text of the login page
    :return: tuple ``(csrf, uuid, need_captcha, origin, fingerprint)``, or
        ``None`` when any field could not be extracted
    """
    try:
        html = etree.HTML(html)
        csrf = html.xpath('//input[@name="csrf"]/@value')[0]
        origin = html.xpath('//input[@name="origin"]/@value')[0]
        fingerprint = html.xpath('//input[@name="fingerprint"]/@value')[0]
        uuid = html.xpath('//i[@class="form-uuid"]/text()')[0]
        captcha_style = html.xpath(
            '//div[@class="form-field J-form-field-captcha form-field--captcha"]/@style')[0]
        need_captcha = captcha_style.replace("display:", "")
        return (csrf, uuid, need_captcha, origin, fingerprint)
    except Exception:
        logging.exception('解析csrf,uuid,need_captcha出錯')
        return None


def formal_login(username, password, param, captcha_path=CAPTCHA_IMAGE_PATH):
    """Submit the login form, asking the operator to type in the captcha.

    The captcha image is always downloaded and prompted for; the
    ``need_captcha`` value in ``param`` is not consulted.

    :param username: account name, sent as the ``email`` form field
    :param password: account password
    :param param: tuple returned by :func:`parse_param`
    :param captcha_path: file the captcha image is written to
    :return: HTML of the login response, or ``None`` on failure
    """
    if not param:
        print('登錄出錯')
        return None
    csrf, uuid = param[0], param[1]
    origin, fingerprint = param[3], param[4]

    captcha_url = 'https://passport.meituan.com/account/captcha?' + urlencode({'uuid': uuid})
    print(captcha_url)
    image_resp = session_get(captcha_url)
    if not image_resp:
        print('登錄出錯')
        return None
    with open(captcha_path, 'wb') as file:
        file.write(image_resp.content)
    captcha = input('需要驗證碼:')

    url_param = {
        'uuid': uuid,
        'service': 'phoenix',
        'continue': 'https://www.zhenguo.com/auth/authenticated/?continue=/help/trust/',
    }
    postdata = {
        'email': username,
        'password': password,
        'captcha': captcha,
        'origin': origin,
        'fingerprint': fingerprint,
        'csrf': csrf,
    }
    url = 'https://passport.meituan.com/account/unitivelogin?' + urlencode(url_param)
    response = session_post(url, data=postdata, header=headers)
    if not response:
        print('登錄出錯')
        return None
    print("登陸成功!")
    return response.text


def parse_token(html):
    """Extract the redirect form fields from the login response page.

    :param html: HTML text returned by :func:`formal_login`
    :return: dict with keys ``action_url``, ``token``, ``expire``,
        ``isdialog``, ``autologin`` and ``csrf``, or ``None`` when parsing fails
    """
    try:
        html = etree.HTML(html)
        return {
            "action_url": html.xpath('//form[@class="J-form mainbox__content"]/@action')[0],
            "token": html.xpath('//input[@name="token"]/@value')[0],
            "expire": html.xpath('//input[@name="expire"]/@value')[0],
            "isdialog": html.xpath('//input[@name="isdialog"]/@value')[0],
            "autologin": html.xpath('//input[@name="autologin"]/@value')[0],
            "csrf": html.xpath('//*[@id="csrf"]/text()')[0],
        }
    except Exception:
        logging.exception('解析token出錯')
        return None


def redirect_login(token_json):
    """Post the token form to its action URL to complete the redirect login.

    Also stores the CSRF token in the shared ``headers`` dict as
    ``x-csrf-token``.

    :param token_json: dict returned by :func:`parse_token`
    :return: ``True`` when the redirect request succeeded, else ``False``
    """
    postdata = {
        'token': token_json['token'],
        'expire': token_json['expire'],
        'isdialog': token_json['isdialog'],
        'autologin': token_json['autologin'],
        'logintype': 'normal',
    }
    headers['x-csrf-token'] = token_json['csrf']
    trust_response = session_post(token_json['action_url'], data=postdata, header=headers)
    if not trust_response:
        print('重定向出錯')
        return False
    print("重定向成功!!")
    return True


def test():
    """Fetch a profile page with the shared session and print the result."""
    time.sleep(5)
    url = 'http://maoyan.com/profile'
    response = session_get(url, header=headers)
    if not response:
        print('測試出錯')
        return
    print(response.status_code)
    print(response.text)


def crawl_order(account_id, token, page_no=1, page_size=20):
    """Crawl the host's orders page by page and store each order.

    :param account_id: account id stored alongside every order
    :param token: login token (not used by this function)
    :param page_no: first page to fetch
    :param page_size: number of orders requested per page
    :return: None
    """
    orders_url = "https://www.zhenguo.com/host/orders/"
    response = session_get(orders_url, header=headers)
    if not response:
        logging.error("crawl_order: could not load %s", orders_url)
        return
    print(response.status_code)
    csrf_nodes = etree.HTML(response.text).xpath('//meta[@name="csrf-token"]/@content')
    if not csrf_nodes:
        logging.error("crawl_order: csrf-token meta tag not found")
        return
    csrf = csrf_nodes[0]
    print(csrf)

    # Work on a copy so the JSON content negotiation headers do not leak into
    # the HTML page requests made elsewhere with the shared ``headers`` dict.
    api_headers = dict(headers)
    api_headers.update({
        'x-csrf-token': csrf,
        'Accept': "application/json",
        'Content-Type': "application/json",
    })
    query_url = "https://www.zhenguo.com/gw/order/api/v1/orderSearch/queryOrderByType"

    while True:
        payload = {'pageNow': page_no, 'pageSize': page_size, 'orderStatusType': 9}
        query_response = session_post(query_url, data=json.dumps(payload), header=api_headers)
        if not query_response:
            logging.error("crawl_order: query failed for page %s", page_no)
            return
        query_list = (query_response.json().get('data') or {}).get('list') or []
        print(len(query_list))
        for order_json in query_list:
            order_id = order_json['orderId']
            storage_database_json(order_id, json.dumps(order_json), 'order', 'zhenguo_order')
            storage_database_text({"id": order_id, 'account_id': account_id}, 'zhenguo_order')

        # A short page means this was the last one.
        if len(query_list) != page_size:
            return
        page_no += 1


def house_detail(list_json):
    """Enrich a room record with fields scraped from its detail page.

    :param list_json: room record; must contain ``id``. It is updated in place.
    :return: the same ``list_json`` dict (unchanged when the page could not be
        fetched)
    """
    room_url = "https://www.zhenguo.com/housing/%s" % list_json["id"]
    room_response = session_get(room_url)
    if not room_response:
        return list_json

    html = etree.HTML(room_response.text)

    # The first three spans of the summary section, in page order.
    summary_xpath = _DETAIL_ROOT + '/section[1]/div[2]/span[%d]/text()'
    for position, field in enumerate(("room_type", "house_wear", "room_area"), start=1):
        list_json[field] = get_node_text(html, summary_xpath % position)

    # Host statistics: each <li> holds a label in div[1] and a value in div[2].
    for node in html.xpath(_DETAIL_ROOT + '/section[2]/ul/li'):
        label = get_node_text(node, './div[1]/text()')
        field = _HOST_STAT_FIELDS.get(label)
        if field is not None:
            list_json[field] = get_node_text(node, './div[2]/text()')

    # Booking limits arrive as one comma-separated string; get_node_text may
    # return None on error, hence the fallback to "".
    reserve_text = get_node_text(html, _DETAIL_ROOT + '/section[8]/ul[1]/li[2]/text()') or ""
    reserve = reserve_text.split(",")
    if len(reserve) > 1:
        list_json["less_day"] = reserve[0].replace("最少預訂", "").replace("天", "").strip()
        list_json["more_day"] = reserve[1].replace("最多預訂", "").replace("天", "").strip()

    list_json["unsubscribe"] = get_node_text(html, _DETAIL_ROOT + '/section[8]/ul[2]/li/text()')
    return list_json


def crawl_room(account_id, token):
    """Crawl the host's listings, their detail pages and their comments.

    :param account_id: account id stored alongside every room
    :param token: login token (not used by this function)
    :return: None
    """
    comment_url = "https://www.zhenguo.com/gw/ugc/api/v1/product/comments?productId=%s&pageNow=1&pageSize=100"
    room_list_url = "https://www.zhenguo.com/house/list/"
    room_response = session_get(url=room_list_url, header=headers)
    if not room_response:
        return

    html = etree.HTML(room_response.text)
    for node in html.xpath('//div[@class="houseCard__block"]'):
        title = get_node_text(node, './div[@class="houseCard__titleLine"]/text()')
        price = (get_node_text(node, './div[@class="houseCard__addLine clearfix"]'
                                     '/span[1]/span[@class="houseCard__price"]/text()')
                 or "").replace("¥", "")
        state = get_node_text(node, './div[@class="houseCard__bottomLine clearfix"]/'
                                    'div[1]/span[@class="houseCard__verifyStatus-5"]/text()')
        room_id = get_node_text(node, './div[@class="houseCard__bottomLine clearfix"]'
                                      '/div[1]/@data-product-id')
        print(account_id, title, price, state, room_id)
        if not room_id:
            # Without an id there is neither a detail page nor a storage key.
            continue

        list_json = {"account_id": account_id, "title": title,
                     "price": price, "state": state, "id": room_id, "room_id": room_id}
        house_json = house_detail(list_json)

        response = session_get(url=comment_url % room_id)
        if response:
            print(response.text)
            try:
                comments = json.dumps(response.json())
            except ValueError:
                logging.exception("crawl_room: comment response is not JSON")
            else:
                storage_database_json(room_id, comments, "comment", "zhenguo_room_info",
                                      l_name="youjia_tpp")
        storage_database_text(house_json, 'zhenguo_room_info')


def crawl_room_list(account_id, token):
    """Print the online room list and each room's quota from the mobile API.

    :param account_id: account id (not used by this function)
    :param token: login token, sent as the ``token`` cookie
    :return: None
    """
    app_header = {"User-Agent": "Mozilla/5.0 (Linux; Android 5.1.1; vivo X7 Build/LMY47V) AppleWebKit/537.36 "
                                "(KHTML, like Gecko) Version/4.0 Chrome/39.0.0.0 Mobile Safari/537.36 TitansX/11.6.12 "
                                "KNB/1.2.0 android/5.1.1 phoenix/com.meituan.phoenix/2.6.0 com.meituan.phoenix/2.6.0",
                  "Cookie": "token=" + token}
    list_url = "https://iphx.meituan.com/ds/product/online/list"
    list_resp = session_get(url=list_url, header=app_header)
    if not list_resp:
        return

    for room_json in list_resp.json()['data']['list']:
        room_id = room_json['productId']
        product_quota_url = ("https://iphx.meituan.com/api/product/api/v1/product/getProductQuota/"
                             + str(room_id))
        product_quota_resp = session_get(url=product_quota_url, header=app_header)
        print(room_json)
        if product_quota_resp:
            print(product_quota_resp.json()['data'])
        else:
            logging.error("crawl_room_list: quota request failed for room %s", room_id)


def crawl(account_id, token):
    """Run the crawlers once the session is logged in.

    :param account_id: account id passed through to the crawlers
    :param token: login token returned by :func:`login`
    :return: None
    """
    crawl_room_list(account_id, token)  # mobile API: room list and quotas

    # Enable as needed:
    # crawl_room(account_id, token)   # listings, detail pages and comments
    # crawl_order(account_id, token)  # orders


def login(username, password):
    """Run the full login flow: pre-login, form login, then redirect login.

    :param username: account name
    :param password: account password
    :return: the login token, or ``None`` when any step failed
    """
    html_pre_login = pre_login()
    if html_pre_login is None:
        return None
    param = parse_param(html_pre_login)
    print("param: ", param)
    if param is None:
        return None
    html_login = formal_login(username, password, param)
    if html_login is None:
        return None
    token_json = parse_token(html_login)
    print("token_json: ", token_json)
    if token_json is None:
        return None
    redirect_login(token_json)
    return token_json['token']


if __name__ == '__main__':
    # Placeholder credentials -- replace with a real account before running.
    username = 'username'
    password = 'username'
    token = login(username, password)
    if token:
        crawl(1, token)