1、古詩文網直接登錄時,用瀏覽器F12抓取登錄接口的入參,我們可以看到框起來的key對應的value是動態參數生成的,需獲取到;
2、登錄接口入參的值一般是登錄接口返回的原數據值;若刷新后接口與對應源碼(element)的值存在一個為空、一個有值的情況,那么就看下是否存在ajax請求,再獲取動態參數的值
3、我們獲取動態參數的值,使用到etree中的xpath進行解析
from TestCase.Api_Review.ClassCode import Chaojiying_Client
from lxml import etree
import requests
import os
# One shared session for every request in this script, so that cookies set
# while loading the login page and the captcha are reused by the login POST.
s = requests.Session()

# Create the output folder; exist_ok avoids the check-then-create race.
os.makedirs('./gushiwenLibs', exist_ok=True)

# Fetch the login page, which carries the captcha image tag and the hidden
# form fields needed by the login request.
url = 'https://so.gushiwen.cn/user/login.aspx?from=http://so.gushiwen.cn/user/collect.aspx'
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'
}
response = s.get(url=url, headers=headers, proxies=None)
# Surface an HTTP error here rather than as an IndexError on the lookups below.
response.raise_for_status()
page_text = response.text
tree = etree.HTML(page_text)

# NOTE(review): the prefix already ends in "RandCode.ashx"; if the page's src
# attribute is itself "/RandCode.ashx" the path gets doubled - confirm against
# the live page before changing it.
img_url = "https://so.gushiwen.cn/RandCode.ashx" + tree.xpath('//*[@id="imgCode"]/@src')[0]

# Dynamic hidden form values that must be echoed back in the login POST.
__VIEWSTATE = tree.xpath('//*[@id="__VIEWSTATE"]/@value')[0]
__VIEWSTATEGENERATOR = tree.xpath('//*[@id="__VIEWSTATEGENERATOR"]/@value')[0]
4、登錄界面的圖片驗證碼,我們先獲取對應的圖形驗證碼,下載到本地,然后再使用第三方平台進行提取
參考此鏈接:Python+Request庫+第三方平台實現驗證碼識別示例
# Download the captcha image through the shared session.
captcha_response = s.get(url=img_url, headers=headers)
# Do not save and submit an error page as if it were the captcha image.
captcha_response.raise_for_status()
img_src = captcha_response.content

# Path where the captcha image is stored.
fileName = './gushiwenLibs/'+'code_img_data.jpg'
with open(fileName, 'wb') as fp:
    fp.write(img_src)

# Recognise the captcha with the Chaojiying platform.
# NOTE(review): the account values are hard-coded; consider loading them from
# configuration instead of keeping them in the source.
chaojiying = Chaojiying_Client('TeacherTao', 'TeacherTao', '96001')
with open(fileName, 'rb') as fp:
    img = fp.read()
# 1004 is presumably the Chaojiying captcha-type code - confirm in its docs.
result = chaojiying.PostPic(img, 1004)['pic_str']
5、最后再使用登錄接口發起請求;我們使用Session進行登錄,因為登錄請求需要在請求頭中攜帶之前請求獲取到的Cookies
# Login URL.
url_login = 'https://so.gushiwen.cn/user/login.aspx?from=http://so.gushiwen.cn/user/collect.aspx'

# Form payload: the two dynamic hidden fields, the account, and the recognised
# captcha text.
# NOTE(review): 'email' and 'pwd' look like real account values committed to
# the source; the complete listing further down uses placeholders instead.
data = {
    '__VIEWSTATE': __VIEWSTATE,
    '__VIEWSTATEGENERATOR': __VIEWSTATEGENERATOR,
    'from': 'http://so.gushiwen.cn/user/collect.aspx',
    'email': '18126248212',
    'pwd': 'qqq123',
    'code': result,
    'denglu': '登錄',
}

# Sent through the shared session so the cookies collected earlier go along.
# Despite its name, post_text holds the response object, not its text.
post_text = s.post(url_login, data=data, headers=headers)

# Save the returned page so the login result can be inspected.
fileName1 = './gushiwenLibs/'+'gushiren.html'
with open(fileName1, 'w', encoding='utf-8') as fp:
    fp.write(post_text.text)
6、整個項目的源碼:
from TestCase.Api_Review.ClassCode import Chaojiying_Client
from lxml import etree
import requests
import os
# One shared session for every request in this script, so that cookies set
# while loading the login page and the captcha are reused by the login POST.
s = requests.Session()

# Create the output folder; exist_ok avoids the check-then-create race.
os.makedirs('./gushiwenLibs', exist_ok=True)

# Fetch the login page, which carries the captcha image tag and the hidden
# form fields needed by the login request.
url = 'https://so.gushiwen.cn/user/login.aspx?from=http://so.gushiwen.cn/user/collect.aspx'
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'
}
response = s.get(url=url, headers=headers, proxies=None)
# Surface an HTTP error here rather than as an IndexError on the lookups below.
response.raise_for_status()
page_text = response.text
tree = etree.HTML(page_text)

# NOTE(review): the prefix already ends in "RandCode.ashx"; if the page's src
# attribute is itself "/RandCode.ashx" the path gets doubled - confirm against
# the live page before changing it.
img_url = "https://so.gushiwen.cn/RandCode.ashx" + tree.xpath('//*[@id="imgCode"]/@src')[0]

# Dynamic hidden form values that must be echoed back in the login POST.
__VIEWSTATE = tree.xpath('//*[@id="__VIEWSTATE"]/@value')[0]
__VIEWSTATEGENERATOR = tree.xpath('//*[@id="__VIEWSTATEGENERATOR"]/@value')[0]
# Download the captcha image through the shared session.
captcha_response = s.get(url=img_url, headers=headers)
# Do not save and submit an error page as if it were the captcha image.
captcha_response.raise_for_status()
img_src = captcha_response.content

# Path where the captcha image is stored.
fileName = './gushiwenLibs/'+'code_img_data.jpg'
with open(fileName, 'wb') as fp:
    fp.write(img_src)

# Recognise the captcha with the Chaojiying platform.
# NOTE(review): the account values are hard-coded; consider loading them from
# configuration instead of keeping them in the source.
chaojiying = Chaojiying_Client('TeacherTao', 'TeacherTao', '96001')
with open(fileName, 'rb') as fp:
    img = fp.read()
# 1004 is presumably the Chaojiying captcha-type code - confirm in its docs.
result = chaojiying.PostPic(img, 1004)['pic_str']
print(result)
# Login URL.
url_login = 'https://so.gushiwen.cn/user/login.aspx?from=http://so.gushiwen.cn/user/collect.aspx'

# Form payload: the two dynamic hidden fields, the account, and the recognised
# captcha text. 'email' and 'pwd' hold placeholder text to be replaced with a
# real account and password before running.
data = {
    '__VIEWSTATE': __VIEWSTATE,
    '__VIEWSTATEGENERATOR': __VIEWSTATEGENERATOR,
    'from': 'http://so.gushiwen.cn/user/collect.aspx',
    'email': '賬號',
    'pwd': '密碼',
    'code': result,
    'denglu': '登錄',
}

# Sent through the shared session so the cookies collected earlier go along.
# Despite its name, post_text holds the response object, not its text.
post_text = s.post(url_login, data=data, headers=headers)

# Save the returned page so the login result can be inspected.
fileName1 = './gushiwenLibs/'+'gushiren.html'
with open(fileName1, 'w', encoding='utf-8') as fp:
    fp.write(post_text.text)