selenium是一個自動化測試工具,利用它可以驅動瀏覽器執行特定的動作,如點擊,下拉等操作,同時還可以獲取瀏覽器當前呈現的頁面源碼,做到可見即可爬。常常被運用於爬取javascript動態渲染的頁面。
下面是其簡單用法:
通過瀏覽器發起請求,獲取響應頁面的源碼數據,然後利用xpath進行數據提取
from selenium import webdriver
from lxml import etree
import time

# NOTE: this snippet targets the Selenium 3 API; the executable_path
# argument was removed in later Selenium 4 releases.

# Create a browser object, pointing it at the local chromedriver binary.
bro = webdriver.Chrome(executable_path='./chromedriver')
try:
    # Have the browser issue a request for the given URL.
    bro.get('http://www.lvse.cn/xiaohua/')
    # page_source returns the source of the page currently shown by the browser.
    page_text = bro.page_source
finally:
    # Always shut the browser (and its chromedriver process) down, even when
    # loading the page fails, so no orphaned processes are left behind.
    bro.quit()

# Parse the captured source and select the target nodes with XPath.
tree = etree.HTML(page_text)
li_list = tree.xpath('//div[@id="slisting"]')
動作鏈以及iframe處理:
通過動作鏈可以完成滑塊滑動的操作
from selenium import webdriver
from time import sleep
from selenium.webdriver import ActionChains

bro = webdriver.Chrome(executable_path='./chromedriver')
try:
    bro.get('https://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')

    # An element that lives inside an iframe can only be located after the
    # driver's scope has been switched into that frame.
    bro.switch_to.frame('iframeResult')
    div = bro.find_element_by_id('draggable')

    # Create an action chain bound to this browser.
    action = ActionChains(bro)
    # Press and hold the mouse button on the located element.
    action.click_and_hold(div)

    for i in range(5):
        # move_by_offset(x, y): x is the horizontal offset, y the vertical one.
        # perform() executes the queued actions immediately.
        action.move_by_offset(17, 0).perform()
        sleep(0.2)

    # release() on its own only queues the mouse-up; it has to be followed by
    # perform() to be sent to the browser. A fresh chain keeps this step
    # independent of whatever `action` has already queued.
    ActionChains(bro).release().perform()
finally:
    # Close the browser and stop chromedriver even if a step above raised.
    bro.quit()
實現無可視化界面以及實現規避檢測:
from time import sleep

# Options: used below to run the browser without a visible window.
from selenium.webdriver.chrome.options import Options
# ChromeOptions: used below for the detection-evasion setting.
from selenium.webdriver import ChromeOptions

# --- Headless mode (no visible browser window) ---
chrome_options = Options()
for _flag in ('--headless', '--disable-gpu'):
    chrome_options.add_argument(_flag)

# --- Detection evasion: exclude Chrome's "enable-automation" switch ---
option = ChromeOptions()
option.add_experimental_option('excludeSwitches', ['enable-automation'])
12306自動登錄操作:
實現原理:利用selenium進行點擊登錄,PIL進行截圖然後提交給超級鷹處理返回坐標,再利用selenium進行點擊圖片驗證碼登錄。
from selenium import webdriver
from time import sleep
from PIL import Image
from selenium.webdriver import ActionChains
from chaojiying import Chaojiying_Client
from selenium.webdriver import ChromeOptions

# Anti-detection: exclude Chrome's "enable-automation" switch.
option = ChromeOptions()
option.add_experimental_option('excludeSwitches', ['enable-automation'])
# The options object only takes effect when it is handed to the driver.
bro = webdriver.Chrome(executable_path='./chromedriver', options=option)

# Every captcha point clicked so far, stored as [x, y] offsets that were
# passed to move_to_element_with_offset on the captcha element.
all_list = []


def get_url():
    """Open the 12306 login page and click the second entry of the tab list.

    NOTE(review): the clicked <li> is presumably the account/password login
    tab -- confirm against the live page, the XPath is purely positional.
    """
    bro.get('https://kyfw.12306.cn/otn/resources/login.html')
    sleep(1)
    bro.find_element_by_xpath('/html/body/div[2]/div[2]/ul/li[2]').click()


def save():
    """Screenshot the current window and return the captcha's crop box.

    The screenshot is written to ./aaa.png.

    Returns:
        A (left, top, right, bottom) tuple of ints built from the location
        and size of the element with id ``J-loginImgArea``.
    """
    code_img_ele = bro.find_element_by_id('J-loginImgArea')
    bro.save_screenshot('./aaa.png')
    location = code_img_ele.location
    size = code_img_ele.size
    rangel = (
        int(location['x']),
        int(location['y']),
        int(location['x'] + size['width']),
        int(location['y'] + size['height']),
    )
    return rangel


def crop():
    """Cut the captcha area out of the screenshot and save it as code.png.

    Calls save() to refresh the screenshot and obtain the crop box.

    Returns:
        The file name of the cropped captcha image.
    """
    rangel = save()
    code_img_name = 'code.png'
    # The context manager closes the screenshot file once the crop is saved.
    with Image.open('./aaa.png') as screenshot:
        screenshot.crop(rangel).save(code_img_name)
    return code_img_name


def get_track():
    """Send the cropped captcha to Chaojiying and return its answer string.

    Returns:
        The ``pic_str`` field of the PostPic response. ensure_list() parses
        it as ``x,y`` pairs separated by ``|``.
    """
    code_img_name = crop()
    # Chaojiying account, password and software id.
    chaojiying = Chaojiying_Client('username', 'passwd', 'id')
    with open(code_img_name, 'rb') as img_file:
        im = img_file.read()
    # NOTE(review): 9004 is the captcha type code handed to PostPic --
    # presumably the coordinate-click type; confirm with Chaojiying's docs.
    result = chaojiying.PostPic(im, 9004)['pic_str']
    return result


def _parse_points(result):
    """Convert 'x1,y1|x2,y2' (or a single 'x,y') into [[x1, y1], [x2, y2]]."""
    if not result:
        raise ValueError('captcha service returned no coordinates')
    points = []
    # split('|') also handles the single-point case: it yields one item.
    for pair in result.split('|'):
        fields = pair.split(',')
        points.append([int(fields[0]), int(fields[1])])
    return points


def ensure_list():
    """Solve the captcha and click each returned point on the captcha image.

    The points obtained in this call are appended to the module-level
    ``all_list``; only these new points are clicked.
    """
    code_img_ele = bro.find_element_by_id('J-loginImgArea')
    points = _parse_points(get_track())
    all_list.extend(points)
    for x, y in points:
        print(x, y)
        ActionChains(bro).move_to_element_with_offset(code_img_ele, x, y).click().perform()
        sleep(0.5)


def login():
    """Fill in the credentials, submit, drag the slider, then quit the browser.

    The browser is closed in all cases, including when a step raises.
    """
    try:
        bro.find_element_by_id('J-userName').send_keys('username')  # 12306 account
        sleep(1)
        bro.find_element_by_id('J-password').send_keys('passwd')  # 12306 password
        sleep(1)
        bro.find_element_by_id('J-login').click()
        # Evade Selenium detection: make navigator.webdriver report false.
        script = 'Object.defineProperty(navigator,"webdriver",{get:() => false,});'
        bro.execute_script(script)
        sleep(3)
        hold_div = bro.find_element_by_xpath('//*[@id="nc_1_n1z"]')
        # Press, drag 400 horizontally and release in a single chain. The
        # release is part of the performed chain so the button is really let go.
        ActionChains(bro).click_and_hold(hold_div).move_by_offset(400, 0).release().perform()
        sleep(5)
    finally:
        bro.quit()


# Run the pipeline once. ensure_list() already triggers get_track() -> crop()
# -> save(), so calling those helpers separately would only repeat the
# screenshot and the captcha-service request.
try:
    get_url()
    ensure_list()
except Exception:
    # login() is what normally closes the browser; close it here when the
    # script fails before reaching it, then let the error propagate.
    bro.quit()
    raise
login()
當然,可用並不一定適用。splash也可以進行動態渲染,並且程序不會阻塞,但是與瀏覽器進行交互也就略顯麻煩;selenium通常也用於通過驗證碼。使用js代碼也可以與瀏覽器進行交互,我也比較推薦這種方式。