美團的反爬機制非常完善。在用 selenium 登錄美團的時候,發現美團能檢測到自動化瀏覽器並彈出滑塊,然後無論怎麼滑動都無法通過驗證。經過一番搜索後發現,很多網站對 selenium 都有檢測機制,例如檢測瀏覽器中是否存在 selenium 特有的標識(如 `window.navigator.webdriver`)。接下來我們簡單分享一下如何使用代理訪問美團進行數據采集。
示例代碼如下:
# -*- coding:UTF-8 -*-
import re
import time
from datetime import date, timedelta

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver import ActionChains
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

TB_LOGIN_URL = 'https://meituan.com'
CHROME_DRIVER = '/usr/local/bin/chromedriver'  # driver path differs between Windows and macOS


class SessionException(Exception):
    """Session error that carries a human-readable message.

    :param message: text returned by ``str()`` on the exception.
    """

    def __init__(self, message):
        # Hand the message (not ``self``) to the base class so that ``args``
        # holds the text instead of a reference back to this very instance.
        super().__init__(message)
        self.message = message

    def __str__(self):
        return self.message


class Crawler:
    """Drive a browser through a scripted login and scrape a target page.

    ``browser`` starts as ``None``; ``start`` calls ``__init_browser`` first,
    which is presumably where the driver gets assigned (that helper is defined
    outside this excerpt -- confirm).

    NOTE(review): the element ids used below (``J_QRCodeLogin``,
    ``TPL_username_1``, ``nc_1_wrapper`` ...) may not belong to the page at
    ``TB_LOGIN_URL`` -- verify them against the real login form.
    """

    def __init__(self):
        self.browser = None

    def start(self, username, password):
        """Run the whole flow: log in, open the target page, parse and save.

        Each step is delegated to a private helper; several of them
        (``__init_browser``, ``__unlock``, ``__submit``,
        ``__navigate_to_target_page``, ``__parse_page_content``,
        ``__save_list_to_db``) are defined outside this excerpt.

        :param username: account name typed into the login form.
        :param password: password typed into the login form.
        :return: None
        """
        print("初始化瀏覽器")
        self.__init_browser()

        print("切換至密碼輸入框")
        self.__switch_to_password_mode()
        time.sleep(0.5)

        print("輸入用戶名")
        self.__write_username(username)
        time.sleep(2.5)

        print("輸入密碼")
        self.__write_password(password)
        time.sleep(3.5)

        print("程序模擬解鎖")
        # Only attempt the slider unlock when the slider is actually shown.
        if self.__lock_exist():
            self.__unlock()

        print("開始發起登錄請求")
        self.__submit()
        time.sleep(4.5)

        # Login is assumed to have succeeded here; request the target page directly.
        print("登錄成功,跳轉至目標頁面")
        self.__navigate_to_target_page()
        time.sleep(6.5)

        print("解析頁面文本")
        crawler_list = self.__parse_page_content()

        # Connect to the database and persist the parsed rows.
        print("保存數據到mysql數據庫")
        self.__save_list_to_db(crawler_list)

    def __switch_to_password_mode(self):
        """Switch the login form to password mode.

        If the element with id ``J_QRCodeLogin`` is displayed, click the
        element with id ``J_Quick2Static``; otherwise do nothing.

        :return: None
        """
        # ``find_element(By.ID, ...)`` replaces ``find_element_by_id``, which
        # was removed from Selenium 4.3 onwards.
        if self.browser.find_element(By.ID, 'J_QRCodeLogin').is_displayed():
            self.browser.find_element(By.ID, 'J_Quick2Static').click()

    def __write_username(self, username):
        """Clear the ``TPL_username_1`` input and type the account name.

        :param username: text to send to the input.
        :return: None
        """
        username_input_element = self.browser.find_element(By.ID, 'TPL_username_1')
        username_input_element.clear()
        username_input_element.send_keys(username)

    def __write_password(self, password):
        """Clear the ``TPL_password_1`` input and type the password.

        :param password: text to send to the input.
        :return: None
        """
        password_input_element = self.browser.find_element(By.ID, "TPL_password_1")
        password_input_element.clear()
        password_input_element.send_keys(password)

    def __lock_exist(self):
        """Tell whether the slider verification widget is present and shown.

        :return: truthy only when ``__is_element_exist('#nc_1_wrapper')``
            (helper defined outside this excerpt) is truthy and the element
            with id ``nc_1_wrapper`` is displayed.
        """
        # The existence check runs first so the lookup below is skipped
        # when the helper reports that the element is absent.
        return (
            self.__is_element_exist('#nc_1_wrapper')
            and self.browser.find_element(By.ID, 'nc_1_wrapper').is_displayed()
        )