selenium模塊
selenium 最初是一個自動化測試工具,而爬蟲中使用它主要是為了解決 requests 無法直接執行 JavaScript 代碼的問題。selenium 的缺點是效率會變得很慢。
selenium 本質是通過驅動瀏覽器,完全模擬瀏覽器的操作,比如跳轉、輸入、點擊、下拉等,來拿到網頁渲染之后的結果,可支持多種瀏覽器
from selenium import webdriver

# Selenium can drive several browsers. Each constructor below launches the
# corresponding browser, and every assignment rebinds ``browser`` to the most
# recently started session.
browser = webdriver.Chrome()
browser = webdriver.Firefox()
browser = webdriver.PhantomJS()
browser = webdriver.Safari()
browser = webdriver.Edge()
一、安裝
安裝:
pip3 install selenium
要自動啟動瀏覽器需要安裝相應的驅動,selenium3 默認支持的 webdriver 是 Firefox,而 Firefox 需要安裝 geckodriver
火狐瀏覽器安裝 driver
下載鏈接:https://github.com/mozilla/geckodriver/releases
- Linux 下載后將 geckodriver 移動到 /usr/local/bin/ 目錄下(或者軟連接到 /usr/bin/geckodriver)
- Windows下載后將 geckodriver.exe 加入環境變量,或者將 geckodriver.exe 移動到腳本同級目錄
谷歌瀏覽器安裝 driver:
- windows:下載 chromedriver.exe 放到python安裝路徑的 scripts 目錄中即可,注意最新版本是2.xx,並非2.9
- Linux:下載 chromedriver.zip,解壓縮后將 chromedriver 文件復制到 /usr/bin 文件夾下(或者軟連接到 /usr/bin/chromedriver)
國內鏡像網站地址:http://npm.taobao.org/mirrors/chromedriver/
最新的版本去官網找:https://sites.google.com/a/chromium.org/chromedriver/downloads
from selenium import webdriver

# Smoke test for chromedriver: a Chrome window should pop up and load Baidu.
chrome = webdriver.Chrome()
chrome.get('https://www.baidu.com')
chrome.page_source  # if the browser opened and reached Baidu, the install works

# Same check for geckodriver: a Firefox window should pop up.
firefox = webdriver.Firefox()
firefox.get('https://www.baidu.com')

#安裝:selenium+phantomjs pip3 install selenium 下載phantomjs,解壓后把phantomjs.exe所在的bin目錄放到環境變量 下載鏈接:http://phantomjs.org/download.html #驗證安裝 C:\Users\Administrator>phantomjs phantomjs> console.log('egon gaga') egon gaga undefined phantomjs> ^C C:\Users\Administrator>python3 Python 3.6.1 (v3.6.1:69c0db5, Mar 21 2017, 18:41:36) [MSC v.1900 64 bit (AMD64)] on win32 Type "help", "copyright", "credits" or "license" for more information. >>> from selenium import webdriver >>> driver=webdriver.PhantomJS() #無界面瀏覽器 >>> driver.get('https://www.baidu.com') >>> driver.page_source
二、等待元素被加載
selenium 是自動運行瀏覽器的一種行為,而瀏覽器的加載是需要時間的,我們在進行操作之前必須等到被操作的元素加載出來后才行,所以我們就需要用到等待的操作。
等待的方式分兩種
#2、等待的方式分兩種: 隱式等待:在browser.get('xxx')前就設置,針對所有元素有效 顯式等待:在browser.get('xxx')之后設置,只針對某個元素有效
示例:

from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By  # locator strategies: By.ID, By.CSS_SELECTOR, ...
from selenium.webdriver.common.keys import Keys  # keyboard keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait  # wait for elements to load

browser = webdriver.Chrome()  # launch the browser
try:
    # Option 1 - implicit wait: set once up front; every element lookup then
    # waits up to 10 seconds for an element that has not been loaded yet.
    browser.implicitly_wait(10)

    browser.get('https://www.baidu.com')
    input_tag = browser.find_element_by_id('kw')
    input_tag.send_keys('美女')
    input_tag.send_keys(Keys.ENTER)

    # Option 2 - explicit wait: wait for one specific element. The two lines
    # below have the same effect here as the implicit wait above.
    # wait = WebDriverWait(browser, 10)
    # wait.until(EC.presence_of_element_located((By.ID, 'content_left')))

    # Without any wait, this lookup raises as soon as the element is missing.
    contents = browser.find_element_by_id('content_left')
    print(contents)
finally:
    # Always end the session, even when a lookup above raised; quit() also
    # stops the driver process, which close() leaves running.
    browser.quit()
三、選擇器
選擇器就是用來做對象爬取的內容進行定位用的,選擇器有很多,每種的功能都不同
基本選擇器

# Official docs: http://selenium-python.readthedocs.io/locating-elements.html
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By  # locator strategies: By.ID, By.CSS_SELECTOR, ...
from selenium.webdriver.common.keys import Keys  # keyboard keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait  # wait for elements to load
import time

# Placeholders only: never commit a real account or password to source.
USERNAME = 'your-username'
PASSWORD = 'your-password'

driver = webdriver.Chrome()
driver.get('https://www.baidu.com')
wait = WebDriverWait(driver, 10)

try:
    # =============== all locator methods ===============
    # 1. find_element_by_id
    # 2. find_element_by_link_text
    # 3. find_element_by_partial_link_text   (fuzzy match)
    # 4. find_element_by_tag_name
    # 5. find_element_by_class_name
    # 6. find_element_by_name
    # 7. find_element_by_css_selector
    # 8. find_element_by_xpath               (covered separately)
    # Notes:
    # 1. each of these can also be written as find_element(By.ID, 'kw')
    # 2. the find_elements_by_xxx variants return a list of all matches

    # =============== example usage ===============
    # 1. find_element_by_id
    print(driver.find_element_by_id('kw'))

    # 2. find_element_by_link_text
    # login = driver.find_element_by_link_text('登錄')
    # login.click()

    # 3. find_element_by_partial_link_text
    login = driver.find_elements_by_partial_link_text('錄')[0]
    login.click()

    # 4. find_element_by_tag_name
    print(driver.find_element_by_tag_name('a'))

    # 5. find_element_by_class_name
    button = wait.until(
        EC.element_to_be_clickable((By.CLASS_NAME, 'tang-pass-footerBarULogin'))
    )
    button.click()

    # 6. find_element_by_name
    input_user = wait.until(EC.presence_of_element_located((By.NAME, 'userName')))
    input_pwd = wait.until(EC.presence_of_element_located((By.NAME, 'password')))
    commit = wait.until(EC.element_to_be_clickable((By.ID, 'TANGRAM__PSP_10__submit')))

    input_user.send_keys(USERNAME)
    input_pwd.send_keys(PASSWORD)
    commit.click()

    # 7. find_element_by_css_selector
    driver.find_element_by_css_selector('#kw')

    # 8. find_element_by_xpath
    time.sleep(5)
finally:
    # quit() ends the whole session and the driver process; close() would only
    # close the current window.
    driver.quit()
xpath

# Official docs: http://selenium-python.readthedocs.io/locating-elements.html
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By  # locator strategies: By.ID, By.CSS_SELECTOR, ...
from selenium.webdriver.common.keys import Keys  # keyboard keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait  # wait for elements to load
import time

driver = webdriver.PhantomJS()
driver.get('https://doc.scrapy.org/en/latest/_static/selectors-sample1.html')
# wait = WebDriverWait(driver, 3)
driver.implicitly_wait(3)  # use an implicit wait

try:
    # --- "//" versus "/" ---
    # A leading "//" searches the whole document. "body/a" only matches direct
    # children of <body>, so the commented-out lookup below would raise here.
    # driver.find_element_by_xpath('//body/a')
    # "body//a" matches any descendant of <body>.
    driver.find_element_by_xpath('//body//a')
    driver.find_element_by_css_selector('body a')

    # --- pick the n-th match; XPath indexes start at 1 ---
    first_anchors = driver.find_elements_by_xpath('//body//a[1]')
    print(first_anchors[0].text)

    # --- lookup by attribute; the tutorial uses these three interchangeably ---
    by_position = driver.find_element_by_xpath('//a[5]')
    by_exact_href = driver.find_element_by_xpath('//a[@href="image5.html"]')
    by_partial_href = driver.find_element_by_xpath('//a[contains(@href,"image5")]')  # fuzzy match
    print('==>', by_position.text)
    print('==>', by_exact_href.text)
    print('==>', by_partial_href.text)

    # --- other forms ---
    absolute_path_link = driver.find_element_by_xpath('/html/body/div/a')
    print(absolute_path_link.text)

    # <a> whose child <img> has src="image3_thumb.jpg"
    thumb_link = driver.find_element_by_xpath('//a[img/@src="image3_thumb.jpg"]')
    print(thumb_link.tag_name, thumb_link.text)

    # <input> with name="continue" and type="button"
    continue_input = driver.find_element_by_xpath("//input[@name='continue'][@type='button']")
    # any tag with name="continue" and type="button"
    continue_any_tag = driver.find_element_by_xpath("//*[@name='continue'][@type='button']")

    time.sleep(5)
finally:
    driver.close()
獲取標簽屬性

from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By  # locator strategies: By.ID, By.CSS_SELECTOR, ...
from selenium.webdriver.common.keys import Keys  # keyboard keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait  # wait for elements to load

browser = webdriver.Chrome()
try:
    browser.get('https://www.amazon.cn/')

    wait = WebDriverWait(browser, 10)
    wait.until(EC.presence_of_element_located((By.ID, 'cc-lm-tcgShowImgContainer')))

    tag = browser.find_element(By.CSS_SELECTOR, '#cc-lm-tcgShowImgContainer img')

    # Read an attribute of the tag.
    print(tag.get_attribute('src'))

    # Element id, position, tag name and size (for reference).
    print(tag.id)
    print(tag.location)
    print(tag.tag_name)
    print(tag.size)
finally:
    # The wait above raises on timeout; release the browser in that case too.
    browser.quit()
四、元素交互操作
交互操作就是指模擬人在網頁中進行輸入或點擊鼠標的操作,主要針對的是 input 框和鏈接。交互操作中可以自己植入 js 代碼
模擬天貓搜索:

from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By  # locator strategies: By.ID, By.CSS_SELECTOR, ...
from selenium.webdriver.common.keys import Keys  # keyboard keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait  # wait for elements to load
import time

driver = webdriver.Chrome()
driver.get('https://www.tmall.com/')
driver.implicitly_wait(3)

try:
    # First search: type the keyword into the search box and press Enter.
    search_box = driver.find_element_by_id('mq')
    search_box.send_keys('NIKE')
    search_box.send_keys(Keys.ENTER)
    time.sleep(3)

    # Second search: the page has reloaded, so look the box up again and clear
    # it before typing the new keyword.
    search_box = driver.find_element_by_id('mq')
    search_box.clear()
    search_box.send_keys('科比戰靴')
    search_box.send_keys(Keys.ENTER)
    time.sleep(5)
finally:
    driver.close()
模擬滑動驗證碼操作(網頁很low逼):

from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By  # locator strategies: By.ID, By.CSS_SELECTOR, ...
from selenium.webdriver.common.keys import Keys  # keyboard keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait  # wait for elements to load
import time

driver = webdriver.Chrome()
driver.get('http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')
# driver.get('http://www.baidu.com')
driver.implicitly_wait(3)

try:
    # The draggable/droppable widgets live inside the "iframeResult" frame
    # (an HTML document nested in the page). Stay inside that frame while
    # looking them up: switching back to the parent first makes both lookups
    # fail, because the parent document does not contain them.
    driver.switch_to.frame('iframeResult')
    source = driver.find_element_by_id('draggable')  # the slider to drag
    target = driver.find_element_by_id('droppable')  # the drop target
    print(source, target)

    # Option 1: let Selenium do the whole drag in one go.
    # actions = ActionChains(driver)
    # actions.drag_and_drop(source, target)
    # actions.perform()

    # Option 2: press, move in small steps, release.
    distance = target.location['x'] - source.location['x']
    ActionChains(driver).click_and_hold(source).perform()  # press and hold the mouse
    print(distance)

    s = 0
    while s < distance:
        print(s)
        ActionChains(driver).move_by_offset(xoffset=2, yoffset=0).perform()  # 2 px per step
        s += 2

    ActionChains(driver).release().perform()  # release the mouse

    # Custom JavaScript can be injected and will be executed by the page.
    driver.execute_script('alert("xxxxxxxxx")')
    time.sleep(6)
finally:
    driver.quit()
補充:frame的切換

# A frame behaves like a separate page: elements inside a child frame are not
# visible from the parent frame. Switch into the frame (switch_to.frame) before
# looking anything up inside it.
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By  # locator strategies: By.ID, By.CSS_SELECTOR, ...
from selenium.webdriver.common.keys import Keys  # keyboard keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait  # wait for elements to load

# Create the driver before the try block: if the constructor itself fails there
# is nothing to clean up, and ``finally`` must not touch an unbound name.
browser = webdriver.Chrome()
try:
    browser.get('http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')

    browser.switch_to.frame('iframeResult')  # enter the frame whose id is iframeResult
    tag1 = browser.find_element_by_id('droppable')
    print(tag1)

    # tag2 = browser.find_element_by_id('textareaCode')  # fails: parent elements are not visible from the child frame
    browser.switch_to.parent_frame()  # back in the parent frame the lookup works
    tag2 = browser.find_element_by_id('textareaCode')
    print(tag2)
finally:
    browser.quit()
模擬瀏覽器的前進后退

# Simulate the browser's back and forward buttons.
import time
from selenium import webdriver

browser = webdriver.Chrome()

# Visit three pages in order so the history has something to step through.
for page_url in (
    'https://www.baidu.com',
    'https://www.taobao.com',
    'http://www.sina.com.cn/',
):
    browser.get(page_url)

browser.back()  # go back one page
time.sleep(10)
browser.forward()  # go forward again
browser.close()

# Cookies
from selenium import webdriver

browser = webdriver.Chrome()
browser.get('https://www.zhihu.com/explore')
print(browser.get_cookies())

# add_cookie() takes ONE cookie per call, described by a dict that must contain
# the "name" and "value" keys; arbitrary key/value pairs are rejected.
for cookie_name, cookie_value in (('k1', 'xxx'), ('k2', 'yyy')):
    browser.add_cookie({'name': cookie_name, 'value': cookie_value})

print(browser.get_cookies())
# browser.delete_all_cookies()
選項卡管理

# Tab management. A new tab can be opened with JavaScript (window.open) or with
# a keyboard shortcut such as Ctrl+T; the JavaScript way is the most portable.
import time
from selenium import webdriver

browser = webdriver.Chrome()
try:
    browser.get('https://www.baidu.com')
    browser.execute_script('window.open()')

    print(browser.window_handles)  # handles of all open tabs

    # switch_to.window replaces the deprecated switch_to_window helper.
    browser.switch_to.window(browser.window_handles[1])
    browser.get('https://www.taobao.com')
    time.sleep(10)

    browser.switch_to.window(browser.window_handles[0])
    browser.get('https://www.sina.com.cn')
finally:
    # close() would only close the current tab and leave the other tab and the
    # driver process behind; quit() shuts everything down.
    browser.quit()
異常處理

from selenium import webdriver
from selenium.common.exceptions import TimeoutException, NoSuchElementException, NoSuchFrameException

# Create the driver outside the try block so that ``finally`` never refers to
# an unbound name when the constructor itself fails.
browser = webdriver.Chrome()
try:
    browser.get('http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')
    # NOTE(review): the frame id looks deliberately misspelled so that the
    # NoSuchFrameException handler below is exercised.
    browser.switch_to.frame('iframssseResult')
except TimeoutException as e:
    print(e)
except NoSuchFrameException as e:
    print(e)
finally:
    browser.quit()
小項目擴展

# Note: site behaviour changes all the time; what matters is the workflow.
# This script worked on 2017-11-07 and is not guaranteed to keep working.
import time

from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

# Placeholders only: never commit a real account, password or address.
MAIL_ACCOUNT = 'your-account'
MAIL_PASSWORD = 'xxxx'
RECIPIENT = 'recipient@example.com'

browser = webdriver.Chrome()

try:
    browser.get('http://mail.163.com/')

    wait = WebDriverWait(browser, 5)

    # The login form is rendered inside an iframe.
    frame = wait.until(EC.presence_of_element_located((By.ID, 'x-URS-iframe')))
    browser.switch_to.frame(frame)

    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.m-container')))

    inp_user = browser.find_element_by_name('email')
    inp_pwd = browser.find_element_by_name('password')
    button = browser.find_element_by_id('dologin')
    inp_user.send_keys(MAIL_ACCOUNT)
    inp_pwd.send_keys(MAIL_PASSWORD)
    button.click()

    # If a captcha shows up, uncomment the lines below to solve it by hand
    # and then submit again.
    # time.sleep(10)
    # button = browser.find_element_by_id('dologin')
    # button.click()

    wait.until(EC.presence_of_element_located((By.ID, 'dvNavTop')))
    # The second <li> in the navigation bar is the "compose" entry.
    write_msg = browser.find_elements_by_css_selector('#dvNavTop li')[1]
    write_msg.click()

    wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'tH0')))
    recv_man = browser.find_element_by_class_name('nui-editableAddr-ipt')
    title = browser.find_element_by_css_selector('.dG0 .nui-ipt-input')
    recv_man.send_keys(RECIPIENT)
    title.send_keys('聖旨')
    print(title.tag_name)

    # The message body is edited inside its own iframe.
    frame = wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'APP-editor-iframe')))
    browser.switch_to.frame(frame)
    body = browser.find_element(By.CSS_SELECTOR, 'body')
    body.send_keys('egon很帥,可以加工資了')

    browser.switch_to.parent_frame()  # back to the enclosing frame
    send_button = browser.find_element_by_class_name('nui-toolbar-item')
    send_button.click()

    # Sleep for a long time so the browser stays open and the result of the
    # send can be inspected by hand.
    time.sleep(10000)

except Exception as e:
    print(e)
finally:
    browser.quit()

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, WebDriverException
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By  # locator strategies: By.ID, By.CSS_SELECTOR, ...
from selenium.webdriver.common.keys import Keys  # keyboard keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait  # wait for elements to load
import time


def get_goods(driver):
    """Print every product on the current result page, then follow "next page".

    Pages are walked in a loop (not by recursion) so a long result set cannot
    hit the recursion limit. A product with a missing field is skipped; the
    crawl ends when there is no "next page" link or the browser reports an
    error, which is printed instead of being silently discarded.

    Args:
        driver: a WebDriver already showing a search result page.
    """
    while True:
        try:
            goods = driver.find_elements_by_class_name('gl-item')

            for good in goods:
                try:
                    detail_url = good.find_element_by_tag_name('a').get_attribute('href')
                    p_name = good.find_element_by_css_selector('.p-name em').text.replace('\n', '')
                    price = good.find_element_by_css_selector('.p-price i').text
                    p_commit = good.find_element_by_css_selector('.p-commit a').text
                except NoSuchElementException:
                    # Incomplete entry: skip it rather than abort the whole crawl.
                    continue

                msg = ''' 商品 : %s 鏈接 : %s 價錢 :%s 評論 :%s ''' % (p_name, detail_url, price, p_commit)
                print(msg, end='\n\n')

            try:
                button = driver.find_element_by_partial_link_text('下一頁')
            except NoSuchElementException:
                return  # last page reached
            button.click()
        except WebDriverException as exc:
            print('crawl stopped:', exc)
            return

        time.sleep(1)


def spider(url, keyword):
    """Open *url*, search for *keyword* and print all products that are found.

    Args:
        url: address of the shop front page containing the search box.
        keyword: text typed into the search box.
    """
    driver = webdriver.Chrome()
    driver.get(url)
    driver.implicitly_wait(3)  # use an implicit wait
    try:
        input_tag = driver.find_element_by_id('key')
        input_tag.send_keys(keyword)
        input_tag.send_keys(Keys.ENTER)
        get_goods(driver)
    finally:
        driver.close()


if __name__ == '__main__':
    spider('https://www.jd.com/', keyword='科比戰靴')

# Install Pillow first:
#     pip3 install pillow
# Pillow is the PIL-based imaging library for Python 3.x (PIL itself only
# supports Python 2.x) and is widely used for image work.

########### first draft: working out the approach ##########
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By  # locator strategies: By.ID, By.CSS_SELECTOR, ...
from selenium.webdriver.common.keys import Keys  # keyboard keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait  # wait for elements to load
from PIL import Image  # pip3 install pillow
import time

# Placeholders only: never commit a real account or password to source.
USERNAME = 'your-username'
PASSWORD = 'your-password'


def get_snap(driver):
    """Save a screenshot of the page to snap.png and return it as a PIL image."""
    driver.save_screenshot('snap.png')
    snap_obj = Image.open('snap.png')
    return snap_obj


def get_image(driver):
    """Return the captcha picture cropped out of a full-page screenshot."""
    img = driver.find_element_by_class_name('geetest_canvas_img')
    time.sleep(2)  # give the picture time to finish loading
    size = img.size
    location = img.location  # position of the picture on the page

    left = location['x']
    top = location['y']
    right = left + size['width']
    bottom = top + size['height']

    snap_obj = get_snap(driver)
    image_obj = snap_obj.crop((left, top, right, bottom))
    # image_obj.show()
    return image_obj


def get_distance(image1, image2):
    """Return the x offset at which the two pictures first differ noticeably.

    Args:
        image1: picture without the gap.
        image2: picture with the gap; read at the same coordinates as image1.

    Returns:
        The column of the first differing pixel minus a 7 px error margin.

    Raises:
        ValueError: if no pixel differs by at least the threshold.
    """
    start_x = 58  # start right of the slider's leftmost position
    threhold = 60  # ignore small colour differences
    # print(image1.size)
    # print(image2.size)
    # Load the pixel access objects once instead of once per pixel.
    pixels1 = image1.load()
    pixels2 = image2.load()
    width, height = image1.size
    for x in range(start_x, width):
        for y in range(height):
            rgb1 = pixels1[x, y]
            rgb2 = pixels2[x, y]
            if any(abs(rgb1[c] - rgb2[c]) >= threhold for c in range(3)):
                return x - 7  # error margin
    raise ValueError('no gap found: the two captcha pictures do not differ')


def get_tracks(distance):
    """Split *distance* into small steps: accelerate, brake, overshoot, return.

    Returns:
        A dict with "forward_tracks" (steps towards and past the target) and
        "back_tracks" (steps that undo the deliberate overshoot).
    """
    distance += 20  # deliberately overshoot by 20 px
    # v = v0 + a*t
    # s = v*t + 0.5*a*(t**2)
    v0 = 0
    s = 0
    t = 0.2
    mid = distance * 3 / 5
    forward_tracks = []
    while s < distance:
        if s < mid:
            a = 2
        else:
            a = -3
        v = v0
        track = v * t + 0.5 * a * (t ** 2)
        track = round(track)  # whole pixels only
        v0 = v + a * t
        s += track
        forward_tracks.append(track)
    back_tracks = [-1, -1, -1, -2, -2, -2, -3, -3, -2, -2, -1]  # sums to -20
    return {"forward_tracks": forward_tracks, 'back_tracks': back_tracks}


# Create the driver before the try block so ``finally`` never sees an unbound
# name when the constructor itself fails.
driver = webdriver.Chrome()
try:
    driver.get('https://passport.cnblogs.com/user/signin')  # cnblogs sign-in page
    driver.implicitly_wait(10)  # implicit wait of 10 seconds

    # 1. Enter account and password, then click the login button.
    input_user = driver.find_element_by_id('input1')
    input_pwd = driver.find_element_by_id('input2')
    login_button = driver.find_element_by_id('signin')
    input_user.send_keys(USERNAME)
    input_pwd.send_keys(PASSWORD)
    login_button.click()

    # 2. Click the verification button; the picture without a gap appears.
    button = driver.find_element_by_class_name('geetest_radar_tip_content')
    button.click()

    # 3. Capture the picture without the gap.
    image1 = get_image(driver)

    # 4. Click the slider button; the picture with the gap appears.
    slider_button = driver.find_element_by_class_name('geetest_slider_button')
    slider_button.click()

    # 5. Capture the picture with the gap.
    image2 = get_image(driver)

    # 6. Compare both pictures to find the gap, i.e. the distance to slide.
    distance = get_distance(image1, image2)
    # print(distance)

    # 7. Cut the total distance into small, human-like steps.
    tracks = get_tracks(distance)

    # 8. Move according to the steps.
    slider_button = driver.find_element_by_class_name('geetest_slider_button')
    ActionChains(driver).click_and_hold(slider_button).perform()  # press and hold

    # Move forward first.
    for forward_track in tracks["forward_tracks"]:
        ActionChains(driver).move_by_offset(xoffset=forward_track, yoffset=0).perform()

    # Short pause, as if noticing the overshoot.
    time.sleep(0.2)

    # Then move back.
    for back_track in tracks["back_tracks"]:
        ActionChains(driver).move_by_offset(xoffset=back_track, yoffset=0).perform()

    # Small jitter before letting go.
    ActionChains(driver).move_by_offset(xoffset=-4, yoffset=0).perform()
    ActionChains(driver).move_by_offset(xoffset=3, yoffset=0).perform()
    time.sleep(0.1)
    ActionChains(driver).move_by_offset(xoffset=-2, yoffset=0).perform()
    ActionChains(driver).move_by_offset(xoffset=3, yoffset=0).perform()
    time.sleep(0.3)

    ActionChains(driver).release().perform()  # release the mouse

    time.sleep(10)
finally:
    driver.quit()


############### refactored version (steps wrapped in functions) #######
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By  # locator strategies: By.ID, By.CSS_SELECTOR, ...
from selenium.webdriver.common.keys import Keys  # keyboard keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait  # wait for elements to load
from PIL import Image  # pip3 install pillow
import time


def get_snap(driver):
    """Save a screenshot of the page to snap.png and return it as a PIL image."""
    driver.save_screenshot('snap.png')
    snap_obj = Image.open('snap.png')
    return snap_obj


def get_image(driver):
    """Return the captcha picture cropped out of a full-page screenshot."""
    img = driver.find_element_by_class_name('geetest_canvas_img')
    time.sleep(2)  # give the picture time to finish loading
    size = img.size
    location = img.location  # position of the picture on the page

    left = location['x']
    top = location['y']
    right = left + size['width']
    bottom = top + size['height']

    snap_obj = get_snap(driver)
    image_obj = snap_obj.crop((left, top, right, bottom))
    # image_obj.show()
    return image_obj


def get_distance(image1, image2):
    """Return the x offset at which the two pictures first differ noticeably.

    Args:
        image1: picture without the gap.
        image2: picture with the gap; read at the same coordinates as image1.

    Returns:
        The column of the first differing pixel minus a 7 px error margin.

    Raises:
        ValueError: if no pixel differs by at least the threshold.
    """
    start_x = 58  # start right of the slider's leftmost position
    threhold = 60  # ignore small colour differences
    # print(image1.size)
    # print(image2.size)
    # Load the pixel access objects once instead of once per pixel.
    pixels1 = image1.load()
    pixels2 = image2.load()
    width, height = image1.size
    for x in range(start_x, width):
        for y in range(height):
            rgb1 = pixels1[x, y]
            rgb2 = pixels2[x, y]
            if any(abs(rgb1[c] - rgb2[c]) >= threhold for c in range(3)):
                return x - 7  # error margin
    raise ValueError('no gap found: the two captcha pictures do not differ')


def get_tracks(distance):
    """Split *distance* into small steps: accelerate, brake, overshoot, return.

    Returns:
        A dict with "forward_tracks" (steps towards and past the target) and
        "back_tracks" (steps that undo the deliberate overshoot).
    """
    distance += 20  # deliberately overshoot by 20 px
    # v = v0 + a*t
    # s = v*t + 0.5*a*(t**2)
    v0 = 0
    s = 0
    t = 0.2
    mid = distance * 3 / 5
    forward_tracks = []
    while s < distance:
        if s < mid:
            a = 2
        else:
            a = -3
        v = v0
        track = v * t + 0.5 * a * (t ** 2)
        track = round(track)  # whole pixels only
        v0 = v + a * t
        s += track
        forward_tracks.append(track)
    back_tracks = [-1, -1, -1, -2, -2, -2, -3, -3, -2, -2, -1]  # sums to -20
    return {"forward_tracks": forward_tracks, 'back_tracks': back_tracks}


def crack(driver):
    """Run the slider steps (2-8 of the first draft) on an open sign-in page."""
    # 2. Click the verification button; the picture without a gap appears.
    button = driver.find_element_by_class_name('geetest_radar_tip_content')
    button.click()

    # 3. Capture the picture without the gap.
    image1 = get_image(driver)

    # 4. Click the slider button; the picture with the gap appears.
    slider_button = driver.find_element_by_class_name('geetest_slider_button')
    slider_button.click()

    # 5. Capture the picture with the gap.
    image2 = get_image(driver)

    # 6. Compare both pictures to find the gap, i.e. the distance to slide.
    distance = get_distance(image1, image2)
    # print(distance)

    # 7. Cut the total distance into small, human-like steps.
    tracks = get_tracks(distance)

    # 8. Move according to the steps.
    slider_button = driver.find_element_by_class_name('geetest_slider_button')
    ActionChains(driver).click_and_hold(slider_button).perform()  # press and hold

    # Move forward first.
    for forward_track in tracks["forward_tracks"]:
        ActionChains(driver).move_by_offset(xoffset=forward_track, yoffset=0).perform()

    # Short pause, as if noticing the overshoot.
    time.sleep(0.2)

    # Then move back.
    for back_track in tracks["back_tracks"]:
        ActionChains(driver).move_by_offset(xoffset=back_track, yoffset=0).perform()

    # Small jitter before letting go.
    ActionChains(driver).move_by_offset(xoffset=-4, yoffset=0).perform()
    ActionChains(driver).move_by_offset(xoffset=3, yoffset=0).perform()
    time.sleep(0.1)
    ActionChains(driver).move_by_offset(xoffset=-2, yoffset=0).perform()
    ActionChains(driver).move_by_offset(xoffset=3, yoffset=0).perform()
    time.sleep(0.3)

    ActionChains(driver).release().perform()  # release the mouse


def login_cnblogs(username, pwd):
    """Sign in to cnblogs with *username* / *pwd* and run the slider step.

    Args:
        username: account name typed into the login form.
        pwd: password typed into the login form.
    """
    driver = webdriver.Chrome()
    try:
        driver.get('https://passport.cnblogs.com/user/signin')  # cnblogs sign-in page
        driver.implicitly_wait(10)  # implicit wait of 10 seconds

        # 1. Enter account and password, then click the login button.
        input_user = driver.find_element_by_id('input1')
        input_pwd = driver.find_element_by_id('input2')
        login_button = driver.find_element_by_id('signin')
        input_user.send_keys(username)
        input_pwd.send_keys(pwd)
        login_button.click()

        # Steps 2-8 live in crack().
        crack(driver)

        time.sleep(10)
    finally:
        driver.quit()


if __name__ == '__main__':
    login_cnblogs(username='Angelababy', pwd='sonoface')

# -*- coding:utf-8 -*-
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import time
import datetime
import random
from threading import Thread


class Driver:
    """Pair a WebDriver factory with a display name used in log output."""

    def __init__(self, _webdriver, brow_name):
        self.brow = _webdriver  # callable that launches the browser
        self.name = brow_name


class OP:
    """Open every post on a range of blog list pages, one thread per browser."""

    def __init__(self, drivers, start_page=1, end_page=11):
        """Store the page range [start_page, end_page) and the Driver objects."""
        self.start_page = start_page
        self.end_page = end_page
        self.drivers = list(drivers)

    def my_print(self, *args):
        """Print *args* (as a tuple) prefixed with the current timestamp."""
        print(datetime.datetime.now(), args)

    def my_sleep(self):
        """Sleep for a random whole number of seconds between 1 and 5."""
        time.sleep(random.randint(1, 5))

    def main(self, _driver):
        """Launch the browser of *_driver* and visit every configured page."""
        driver = _driver.brow()  # launch the browser
        try:
            for i in range(self.start_page, self.end_page):
                url = 'https://www.cnblogs.com/zhuminghui/default.html?page=' + str(i)
                post_slots = 22 if i == 1 else 20
                if _driver.name == 'Safari':
                    # Safari reloads the list page before every post.
                    for j in range(post_slots):
                        # driver.implicitly_wait(10)
                        self.my_print(url)
                        driver.get(url)
                        self.inner(driver, i, j, _driver.name)
                        self.my_sleep()
                else:
                    self.my_print(url)
                    driver.get(url)
                    for j in range(post_slots):
                        self.inner(driver, i, j, _driver.name)
                        # wait = WebDriverWait(driver, 5)
                        # wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#homepage1_HomePageDays_ctl00_ImageLink')))
                        self.my_sleep()
                # readmore_list = driver.find_elements_by_class_name('c_b_p_desc_readmore')
        finally:
            # Release the browser even when a page visit raised.
            driver.quit()

    def inner(self, driver, i, j, name):
        """Open the j-th post of list page *i*, then return to the list page.

        Returns:
            True when the post was opened and the browser went back; None when
            the post was skipped, the list had no such entry, or an error
            occurred.
        """
        navigated = False  # becomes True once the click left the list page
        try:
            postTitle2 = driver.find_elements_by_class_name('postTitle2')
            if len(postTitle2) < j + 1:
                self.my_print(name, j + 1, 'end')
                return
            if 'shadowsocks' in postTitle2[j].text or '****' in postTitle2[j].text:
                self.my_print('---<<< %s >>> page %s ,line %s <%s> ---' % (name, i, j + 1, postTitle2[j].text))
                return
            postTitle2[j].click()
            navigated = True
            self.my_sleep()
            wait = WebDriverWait(driver, 5)
            wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#blog_post_info_block')))
            driver.execute_script("arguments[0].scrollIntoView();", driver.find_element_by_id('blog_post_info_block'))
            title = driver.find_element_by_class_name('postTitle2')
            self.my_print('<<< %s >>>page %s ,line %s <%s>' % (name, i, j + 1, title.text))
            self.my_sleep()
            driver.back()
            return True
        except Exception as e:
            self.my_print(name, 'Error:', e)
            # Only go back if the click actually navigated; otherwise back()
            # would leave the list page and break every following lookup.
            if navigated:
                driver.back()
            self.my_sleep()

    def start(self):
        """Run main() for every driver in its own thread and wait for all."""
        ts = [Thread(target=self.main, args=(driver,)) for driver in self.drivers]
        for t in ts:
            t.start()
        for t in ts:
            t.join()


if __name__ == '__main__':
    chrome = Driver(webdriver.Chrome, 'Chrome')
    firefox = Driver(webdriver.Firefox, 'Firefox')
    safari = Driver(webdriver.Safari, 'Safari')
    obj = OP(drivers=[chrome, firefox], start_page=1, end_page=13)
    obj.start()