利用模拟点击获取cookie信息,模拟登陆网页


现在的很多网站模拟登陆比较复杂,如果模拟点击不是自己任务的目的,可以通过模拟点击模拟登陆网站,进而爬取自己所需内容。

缺点:模拟登陆不稳定,有时可以正确登陆有时莫名其妙的出问题,这些问题有些是网页没有加载出来等,可以加入智能等待等。

 所需环境:selenium 库、Chrome 浏览器(及与之版本匹配的 ChromeDriver)

1.创建session对象,清除session里的内容

req=requests.Session()#instantiation

req.headers.clear()

2.模拟登陆网站

模拟登陆网站后,浏览器中会带有cookie信息,这个cookie信息就是我们所需的内容,网站只有检测到这个信息,才认定我们是合法用户,会呈现给我们合法用户可以看到的内容,我们才可以爬取所需内容。

比如:模拟登陆 Flickr 网站

# Log in to Flickr (via Yahoo OAuth) with a real Chrome browser driven by
# Selenium, then copy the browser's cookies into a requests.Session so that
# subsequent plain-HTTP requests are treated as an authenticated user.
import requests
import json
import os
from time import sleep
from selenium import webdriver

path = 'D:\\flickimages'  # local folder that will hold downloaded images

req = requests.Session()  # session that will carry the login cookies
req.headers.clear()       # start from an empty header set

wd = webdriver.Chrome()
if not os.path.exists(path):
    os.makedirs(path)

# Yahoo login page that redirects back to flickr.com after a successful sign-in.
wd.get('https://login.yahoo.com/manage_account?done=https%3A%2F%2Fapi.login.yahoo.com%2Foauth2%2Frequest_auth%3Fclient_id%3Ddj0yJmk9NTJmMkVmOFo3RUVmJmQ9WVdrOVdXeGhVMWx3TjJFbWNHbzlNQS0tJnM9Y29uc3VtZXJzZWNyZXQmeD01OA--%26redirect_uri%3Dhttps%253A%252F%252Fwww.flickr.com%252Fsignin%252Fyahoo%252Foauth%252F%253Fredir%253Dhttps%25253A%25252F%25252Fwww.flickr.com%25252F%25253Fytcheck%25253D1%252526new_session%25253D1%26response_type%3Dcode%26scope%3Dopenid%252Csdpp-w%26nonce%3D3cf482a1a4fd106dd282768381a44220&redirect_uri=https%3A%2F%2Fwww.flickr.com%2Fsignin%2Fyahoo%2Foauth%2F%3Fredir%3Dhttps%253A%252F%252Fwww.flickr.com%252F%253Fytcheck%253D1%2526new_session%253D1&authMechanism=primary&eid=100')

# Yahoo asks for the username first, then shows the password field on the
# next page, hence the two separate "login-signin" submissions.
# NOTE(review): find_element_by_xpath is the legacy Selenium 3 API; Selenium 4
# replaced it with find_element(By.XPATH, ...) — keep in mind when upgrading.
wd.find_element_by_xpath('//*[@id="login-username"]').send_keys('username')
sleep(3)
wd.find_element_by_xpath('//*[@id="login-signin"]').submit()
sleep(3)
wd.find_element_by_xpath('//*[@id="login-passwd"]').send_keys('password')
sleep(3)
wd.find_element_by_xpath('//*[@id="login-signin"]').click()
sleep(10)  # wait for the post-login redirect so the cookies get populated

# Copy every browser cookie into the requests session; the session now
# carries the original login state.
cookies = wd.get_cookies()
for cookie in cookies:
    req.cookies.set(cookie['name'], cookie['value'])

上面的步骤用于获取我们所需的cookie信息

模拟登陆 Flickr 网站,并爬取自己所需的信息。爬取的是关键词:tree 的图片,并下载下来

# Use Selenium to perform the real browser login, then hand the resulting
# cookies to a requests.Session so the Flickr REST API below can be called
# as an authenticated user.
import requests
import json
import os
from time import sleep
from selenium import webdriver

path = 'D:\\flickimages'  # destination folder for the downloaded images

req = requests.Session()  # session that will carry the login cookies
req.headers.clear()       # start from an empty header set

wd = webdriver.Chrome()
if not os.path.exists(path):
    os.makedirs(path)

# Yahoo login page that redirects back to flickr.com after a successful sign-in.
wd.get('https://login.yahoo.com/manage_account?done=https%3A%2F%2Fapi.login.yahoo.com%2Foauth2%2Frequest_auth%3Fclient_id%3Ddj0yJmk9NTJmMkVmOFo3RUVmJmQ9WVdrOVdXeGhVMWx3TjJFbWNHbzlNQS0tJnM9Y29uc3VtZXJzZWNyZXQmeD01OA--%26redirect_uri%3Dhttps%253A%252F%252Fwww.flickr.com%252Fsignin%252Fyahoo%252Foauth%252F%253Fredir%253Dhttps%25253A%25252F%25252Fwww.flickr.com%25252F%25253Fytcheck%25253D1%252526new_session%25253D1%26response_type%3Dcode%26scope%3Dopenid%252Csdpp-w%26nonce%3D3cf482a1a4fd106dd282768381a44220&redirect_uri=https%3A%2F%2Fwww.flickr.com%2Fsignin%2Fyahoo%2Foauth%2F%3Fredir%3Dhttps%253A%252F%252Fwww.flickr.com%252F%253Fytcheck%253D1%2526new_session%253D1&authMechanism=primary&eid=100')

# Username and password live on two consecutive pages, hence the two
# separate "login-signin" submissions with fixed waits in between.
wd.find_element_by_xpath('//*[@id="login-username"]').send_keys('username')
sleep(3)
wd.find_element_by_xpath('//*[@id="login-signin"]').submit()
sleep(3)
wd.find_element_by_xpath('//*[@id="login-passwd"]').send_keys('password')
sleep(3)
wd.find_element_by_xpath('//*[@id="login-signin"]').click()
sleep(10)  # wait for the post-login redirect so the cookies get populated

# Copy every browser cookie into the requests session; the session now
# carries the original login state.
cookies = wd.get_cookies()
for cookie in cookies:
    req.cookies.set(cookie['name'], cookie['value'])

def get_imginfo(results, req, url):
    """Fetch one page of Flickr search results and append image records.

    Parameters:
        results: list that the photo records are appended to (also returned).
        req: requests.Session already carrying the Flickr login cookies.
        url: Flickr REST API search URL that returns a JSON document with
             a ``photos.photo`` array.

    Returns:
        The same ``results`` list, with one dict per photo containing the
        ids needed to build a static-image URL plus the comment count.
    """
    payload = json.loads(req.get(url).text)
    # An empty 'photo' array simply results in no appends.
    for photo in payload['photos']['photo']:
        record = {
            'id': photo['id'],
            'server': photo['server'],
            'secret': photo['secret'],
            'count_comments': photo['count_comments'],
        }
        # Static image URL built per Flickr's farm/server/id_secret scheme.
        record['imageurl'] = 'http://farm5.staticflickr.com/{0}/{1}_{2}.jpg'.format(
            record['server'], record['id'], record['secret'])
        results.append(record)
    return results
def save_images(records):
    """Download every image in *records* into the module-level *path* folder.

    Each record must carry 'imageurl' and 'id'; the file is saved as
    ``<id>.jpg``. Files that already exist on disk are skipped, so the
    function is safe to re-run after a partial crawl. Uses a plain
    ``requests.get`` (no login cookies) — assumes the static image servers
    do not require authentication; TODO confirm.
    """
    for record in records:
        img_name = os.path.join(path, record.get('id') + '.jpg')
        if os.path.exists(img_name):
            continue  # already downloaded earlier
        with open(img_name, 'wb') as fp:
            fp.write(requests.get(record.get('imageurl')).content)
if __name__ == '__main__':
    # Flickr image URLs do not follow one simple predictable pattern, so the
    # search endpoint below was captured from the browser's network panel.
    # Re-check the matching JSON request headers before each run: the csrf
    # token and api_key parameters expire.
    start_url = 'https://api.flickr.com/services/rest?sort=relevance&parse_tags=1&content_type=7&extras=can_comment%2Ccount_comments%2Ccount_faves%2Cdescription%2Cisfavorite%2Clicense%2Cmedia%2Cneeds_interstitial%2Cowner_name%2Cpath_alias%2Crealname%2Crotation%2Curl_c%2Curl_l%2Curl_m%2Curl_n%2Curl_q%2Curl_s%2Curl_sq%2Curl_t%2Curl_z&per_page=25&page=2&lang=zh-Hant-HK&ytcheck=1&new_session=1&text=tree&viewerNSID=162943202%40N02&method=flickr.photos.search&csrf=1531144274%3Ac16v3htbthn%3A0372cbc990da575b6dabaeb9fbd2a1d8&api_key=b46c1c97c76e385a586bbc613642f41a&format=json&hermes=1&hermesClient=1&reqId=7a39fab7&nojsoncallback=1'
    # To crawl several pages, replace the page parameter with {} and loop:
    # for page in range(1, 3): url = start_url.format(page)
    images = get_imginfo([], req, start_url)
    save_images(images)

 


免责声明!

本站转载的文章为个人学习借鉴使用,本站对版权不负任何法律责任。如果侵犯了您的隐私权益,请联系本站邮箱yoyou2525@163.com删除。



 
粤ICP备18138465号  © 2018-2025 CODEPRJ.COM