代碼:
# -*- coding: utf-8 -*-
"""
Created on Fri Jul 13 16:13:52 2018

@author: a

Log in to CSDN through headless Chrome (Selenium), copy the browser's cookies
into a requests session, then fetch a page that requires login and save it.
"""
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
import time
import urllib.request
import urllib.parse
from urllib.error import URLError
from urllib.error import HTTPError
import requests

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')  # headless (no UI) mode
# The next line is the WRONG way to use it (kept for reference):
# browser = webdriver.Chrome(chrome_options=chrome_options,executable_path = "C:\Program Files (x86)\Google\Chrome\Application\chrome.exe")
# The next line is the headless-mode setup that is actually used.
browser = webdriver.Chrome(chrome_options=chrome_options)
# browser = webdriver.Chrome()
print("xiaojie")
url = "https://passport.csdn.net/account/login"

# Bind the name up front so the requests stage below can tell "login failed"
# apart from "name was never assigned" (the latter used to be a NameError).
cookies = []
try:
    browser.get(url)
    data = browser.page_source
    print(len(data))
    # NOTE(review): presumably this link switches the page to the
    # username/password form -- confirm against the live page.
    target = browser.find_element_by_xpath('/html/body/div[3]/div/div/div[2]/div/h3/a')
    print("target:", target)
    target.click()
    # Wait up to 20 s (polling every 0.5 s) for the username field to exist.
    locator = (By.ID, 'username')
    WebDriverWait(browser, 20, 0.5).until(EC.presence_of_element_located(locator))
    username = browser.find_element_by_id('username')
    print("username:", username)
    time.sleep(3)
    username.clear()
    username.send_keys('183247166@qq.com')
    password = browser.find_element_by_id('password')
    print("password:", password)
    password.clear()
    password.send_keys('xxx')
    submit = browser.find_element_by_xpath('//*[@id="fm1"]/input[8]')
    print("submit:", submit)
    submit.click()
    # time.sleep(10)  # No need to wait for the page refresh; this step is unnecessary.
    # Save the cookie information.
    cookies = browser.get_cookies()
except Exception as e:
    print("driver出現異常")
    print(e)
finally:
    # quit() ends the whole WebDriver session; close() only closes the window.
    browser.quit()
    print("over")

if not cookies:
    # Without cookies the fetch below would save a logged-out page and still
    # report success, so stop here instead.
    raise SystemExit("no cookies captured from the browser; login did not complete")

# Copy the cookies obtained from the Selenium login into a requests session.
s = requests.session()
c = requests.cookies.RequestsCookieJar()
for item in cookies:
    c.set(item["name"], item["value"])
s.cookies.update(c)
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.5193.400 QQBrowser/10.0.1066.400"
}
url2 = "https://download.csdn.net/my"  # Personal space; only reachable after logging in on the website.
try:
    print("開始爬網頁")
    response = s.get(url2, headers=headers)
    data = response.text
    print(len(data))
    print(data)
    # `with` closes the file even if the write fails.
    with open("./驗證cookie能使用的網頁,斷網打開.html", "w", encoding='utf-8') as fhandle:
        fhandle.write(data)
    print("驗證cookie能使用的網頁,斷網打開.html已經成功生成")
except Exception as e:
    print(e)
在這段代碼裡,取出cookie信息以後,瀏覽器就關閉了。然後用requests.session攜帶cookie信息去訪問個人中心頁面,發現可以成功保持登錄狀態,並把個人中心頁面保存下來!
我做過其它方面的實驗,就是,如下的代碼:
# -*- coding: utf-8 -*-
"""
Created on Fri Jul 13 16:13:52 2018

@author: a

Log in to CSDN with Selenium, turn the browser cookies into a CookieJar, and
fetch a login-only page with urllib through an opener that carries the jar.
"""
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
import time
import urllib.request
import urllib.parse
from urllib.error import URLError
from urllib.error import HTTPError
import requests

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')  # headless (no UI) mode
# The next line is the WRONG way to use it (kept for reference):
# browser = webdriver.Chrome(chrome_options=chrome_options,executable_path = "C:\Program Files (x86)\Google\Chrome\Application\chrome.exe")
# The next line would use headless mode:
# browser = webdriver.Chrome(chrome_options=chrome_options)
browser = webdriver.Chrome()
print("xiaojie")
url = "https://passport.csdn.net/account/login"
try:
    browser.get(url)
    data = browser.page_source
    print(len(data))
    curpage_url = browser.current_url
    print(curpage_url)
    # NOTE(review): presumably this link switches the page to the
    # username/password form -- confirm against the live page.
    target = browser.find_element_by_xpath('/html/body/div[3]/div/div/div[2]/div/h3/a')
    print("target:", target)
    target.click()
    # Wait up to 20 s (polling every 0.5 s) for the username field to exist.
    locator = (By.ID, 'username')
    WebDriverWait(browser, 20, 0.5).until(EC.presence_of_element_located(locator))
    username = browser.find_element_by_id('username')
    print("username:", username)
    time.sleep(3)
    username.clear()
    username.send_keys('183247166@qq.com')
    password = browser.find_element_by_id('password')
    print("password:", password)
    password.clear()
    password.send_keys('xxx')
    submit = browser.find_element_by_xpath('//*[@id="fm1"]/input[8]')
    print("submit:", submit)
    submit.click()
    curpage_url = browser.current_url
    print(curpage_url)
    # time.sleep(10)  # No need to wait for the page refresh; this step is unnecessary.

    # Save the cookie information. Build the name -> value map straight from
    # the driver's cookie dicts: going through "name:value" strings and
    # split(':') truncated any value that itself contained a colon.
    cook_map = {item["name"]: item["value"] for item in browser.get_cookies()}
    print(cook_map)
    cookies = requests.utils.cookiejar_from_dict(cook_map, cookiejar=None, overwrite=True)
    s = requests.Session()
    s.cookies = cookies

    # Crawl the page with urllib, sending the cookies through the opener.
    headers = ("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.5193.400 QQBrowser/10.0.1066.400")
    opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookies))
    opener.addheaders = [headers]
    url2 = "https://download.csdn.net/my"  # Personal space; only reachable after logging in on the website.
    try:
        urllib.request.install_opener(opener)
        response = urllib.request.urlopen(url2)
        print("開始爬網頁")
        # Read the body of THIS response. The file used to be written from
        # `data`, which still held the login page source captured before
        # logging in, so the saved page always looked logged out.
        data = response.read()
        # Write the raw bytes so no assumption is made about the page encoding.
        with open("./驗證cookie能使用的網頁,斷網打開.html", "wb") as fhandle:
            fhandle.write(data)
        print("驗證cookie能使用的網頁,斷網打開.html已經成功生成")
    except HTTPError as e:
        print("出現HTTP異常")
        print(e.code)
        print(e.reason)
    except URLError as e:
        print("出現URL異常")
        print(e.reason)
except Exception as e:
    print("driver出現異常")
    print(e)
finally:
    # quit() ends the whole WebDriver session; close() only closes the window.
    browser.quit()
    print("over")
我寫的上面的代碼是有問題的。打開保存的網頁是:說明沒有保持登錄狀態。
如果還使用urllib.request.urlopen,並且將selenium模擬登錄之後的cookie轉變為CookieJar的方式,會發現不能保持登錄狀態訪問。而《精通Python網絡爬蟲》一書中是要先創建CookieJar對象,創建全局的opener,然後再登錄,這樣的話,cookie會保存到全局的opener中,之後就能保持登錄狀態,繼續使用urllib。但是,我這裡先用selenium模擬登錄,之後才是基於得到的cookie創建CookieJar,然後再使用urllib。為什麼會不行呢?
參考如下博客:
https://blog.csdn.net/warrior_zhang/article/details/50198699
python 利用selenium模擬登錄帳號驗證網站並獲取cookie
其中包含兩步:
6.通過對象的方法獲取當前訪問網站的session cookie:
#get the session cookie cookie = [item["name"] + "=" + item["value"] for item in sel.get_cookies()] #print cookie
cookiestr = ';'.join(item for item in cookie) print cookiestr
|
7.得到cookie之后,就可以通過urllib2訪問相應的網站,並可實現網頁爬取等工作:
import urllib2 print '%%%using the urllib2 !!' homeurl = sel.current_url print 'homeurl: %s' % homeurl headers = {'cookie':cookiestr} req = urllib2.Request(homeurl, headers = headers) try: response = urllib2.urlopen(req) text = response.read() fd = open('homepage', 'w') fd.write(text) fd.close() print '###get home page html success!!' except: print '### get home page html error!!' |
它是將cookie放在headers中,之後再使用urllib。我們測試一下:
編寫代碼:
# -*- coding: utf-8 -*-
"""
Created on Fri Jul 13 16:13:52 2018

@author: a

Log in to CSDN through headless Chrome (Selenium), serialise the browser
cookies into a Cookie request header, and fetch a login-only page with urllib.
"""
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
import time
import urllib.request
import urllib.parse
from urllib.error import URLError
from urllib.error import HTTPError

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')  # headless (no UI) mode
# The next line is the WRONG way to use it (kept for reference):
# browser = webdriver.Chrome(chrome_options=chrome_options,executable_path = "C:\Program Files (x86)\Google\Chrome\Application\chrome.exe")
# The next line is the headless-mode setup that is actually used.
browser = webdriver.Chrome(chrome_options=chrome_options)
# browser = webdriver.Chrome()
print("xiaojie")
url = "https://passport.csdn.net/account/login"
try:
    browser.get(url)
    data = browser.page_source
    print(len(data))
    curpage_url = browser.current_url
    print(curpage_url)
    # NOTE(review): presumably this link switches the page to the
    # username/password form -- confirm against the live page.
    target = browser.find_element_by_xpath('/html/body/div[3]/div/div/div[2]/div/h3/a')
    print("target:", target)
    target.click()
    # Wait up to 20 s (polling every 0.5 s) for the username field to exist.
    locator = (By.ID, 'username')
    WebDriverWait(browser, 20, 0.5).until(EC.presence_of_element_located(locator))
    username = browser.find_element_by_id('username')
    print("username:", username)
    time.sleep(3)
    username.clear()
    username.send_keys('183247166@qq.com')
    password = browser.find_element_by_id('password')
    print("password:", password)
    password.clear()
    password.send_keys('xxx')
    submit = browser.find_element_by_xpath('//*[@id="fm1"]/input[8]')
    print("submit:", submit)
    submit.click()
    curpage_url = browser.current_url
    print(curpage_url)
    # time.sleep(10)  # No need to wait for the page refresh; this step is unnecessary.

    # Save the cookie information. A Cookie request header is a list of
    # "name=value" pairs separated by "; ". Joining name and value with ":"
    # produced a header the server could not parse as cookies.
    cookie = [item["name"] + "=" + item["value"] for item in browser.get_cookies()]
    print(cookie)
    cookiestr = '; '.join(cookie)

    # Crawl the page with urllib, passing the cookies as a request header.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.5193.400 QQBrowser/10.0.1066.400",
        "cookie": cookiestr
    }
    url2 = "https://download.csdn.net/my"  # Personal space; only reachable after logging in on the website.
    try:
        req = urllib.request.Request(url2, headers=headers)
        response = urllib.request.urlopen(req)
        data = response.read()
        print("開始爬網頁")
        # `with` closes the file even if the write fails.
        with open("./驗證cookie能使用的網頁,斷網打開.html", "wb") as fhandle:
            fhandle.write(data)
        print("驗證cookie能使用的網頁,斷網打開.html已經成功生成")
    except HTTPError as e:
        print("出現HTTP異常")
        print(e.code)
        print(e.reason)
    except URLError as e:
        print("出現URL異常")
        print(e.reason)
except Exception as e:
    print("driver出現異常")
    print(e)
finally:
    # quit() ends the whole WebDriver session; close() only closes the window.
    browser.quit()
    print("over")
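結果還是不行。具體原因有待研究。一個值得先檢查的地方:Cookie請求頭必須是「name=value; name2=value2」的格式(上面參考博客裡拼接時用的是"="),如果把name和value用":"拼接,服務器就無法識別這些cookie。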