現在很多網站的模擬登陸比較復雜,如果研究登陸協議本身不是任務的目的,可以借助 selenium 模擬點擊來登陸網站,進而爬取自己所需的內容。
缺點:模擬登陸不穩定,有時可以正確登陸有時莫名其妙的出問題,這些問題有些是網頁沒有加載出來等,可以加入智能等待等。
所需工具:selenium 庫、Chrome 瀏覽器(以及對應版本的 ChromeDriver)
1.創建session對象,清除session里的內容
req=requests.Session()#instantiation
req.headers.clear()
2.模擬登陸網站
模擬登陸網站后,瀏覽器中會帶有cookie信息,這個cookie信息就是我們所需的內容,網站只有檢測到這個信息,才認定我們是合法用戶,會呈現給我們合法用戶可以看到的內容,我們才可以爬取所需內容。
比如:模擬登陸 Flickr 網站
import requests
import json
import os
from time import sleep
from selenium import webdriver
path='D:\\flickimages'
req=requests.Session()#instantiation
req.headers.clear()
wd = webdriver.Chrome()
if not os.path.exists(path):
os.makedirs(path)
wd.get('https://login.yahoo.com/manage_account?done=https%3A%2F%2Fapi.login.yahoo.com%2Foauth2%2Frequest_auth%3Fclient_id%3Ddj0yJmk9NTJmMkVmOFo3RUVmJmQ9WVdrOVdXeGhVMWx3TjJFbWNHbzlNQS0tJnM9Y29uc3VtZXJzZWNyZXQmeD01OA--%26redirect_uri%3Dhttps%253A%252F%252Fwww.flickr.com%252Fsignin%252Fyahoo%252Foauth%252F%253Fredir%253Dhttps%25253A%25252F%25252Fwww.flickr.com%25252F%25253Fytcheck%25253D1%252526new_session%25253D1%26response_type%3Dcode%26scope%3Dopenid%252Csdpp-w%26nonce%3D3cf482a1a4fd106dd282768381a44220&redirect_uri=https%3A%2F%2Fwww.flickr.com%2Fsignin%2Fyahoo%2Foauth%2F%3Fredir%3Dhttps%253A%252F%252Fwww.flickr.com%252F%253Fytcheck%253D1%2526new_session%253D1&authMechanism=primary&eid=100')
wd.find_element_by_xpath('//*[@id="login-username"]').send_keys('username')
sleep(3)
wd.find_element_by_xpath('//*[@id="login-signin"]').submit()
sleep(3)
wd.find_element_by_xpath('//*[@id="login-passwd"]').send_keys('password')
sleep(3)
wd.find_element_by_xpath('//*[@id="login-signin"]').click()
sleep(10)#等待Cookies加載
cookies = wd.get_cookies()
for cookie in cookies:
req.cookies.set(cookie['name'],cookie['value'])#session里面包含了原先的登陸信息
上面的步驟用於獲取我們所需的cookie信息
模擬登陸 Flickr 網站,並爬取自己所需的信息:這裡搜索關鍵詞 tree 的圖片,並下載下來。
#use selenium to load web,ok!
import requests
import json
import os
from time import sleep
from selenium import webdriver
path='D:\\flickimages'
req=requests.Session()#instantiation
req.headers.clear()
wd = webdriver.Chrome()
if not os.path.exists(path):
os.makedirs(path)
wd.get('https://login.yahoo.com/manage_account?done=https%3A%2F%2Fapi.login.yahoo.com%2Foauth2%2Frequest_auth%3Fclient_id%3Ddj0yJmk9NTJmMkVmOFo3RUVmJmQ9WVdrOVdXeGhVMWx3TjJFbWNHbzlNQS0tJnM9Y29uc3VtZXJzZWNyZXQmeD01OA--%26redirect_uri%3Dhttps%253A%252F%252Fwww.flickr.com%252Fsignin%252Fyahoo%252Foauth%252F%253Fredir%253Dhttps%25253A%25252F%25252Fwww.flickr.com%25252F%25253Fytcheck%25253D1%252526new_session%25253D1%26response_type%3Dcode%26scope%3Dopenid%252Csdpp-w%26nonce%3D3cf482a1a4fd106dd282768381a44220&redirect_uri=https%3A%2F%2Fwww.flickr.com%2Fsignin%2Fyahoo%2Foauth%2F%3Fredir%3Dhttps%253A%252F%252Fwww.flickr.com%252F%253Fytcheck%253D1%2526new_session%253D1&authMechanism=primary&eid=100')
#wd.find_element_by_xpath('//*[@id="manage-account"]/a').click()
wd.find_element_by_xpath('//*[@id="login-username"]').send_keys('username')
sleep(3)
wd.find_element_by_xpath('//*[@id="login-signin"]').submit()
sleep(3)
wd.find_element_by_xpath('//*[@id="login-passwd"]').send_keys('password')
sleep(3)
wd.find_element_by_xpath('//*[@id="login-signin"]').click()
sleep(10)#等待Cookies加載
cookies = wd.get_cookies()
for cookie in cookies:
req.cookies.set(cookie['name'],cookie['value'])#session里面包含了原先的登陸信息
def get_imginfo(list,req,url):
#response =req.get(url)
response = req.get(url).text
jd = json.loads(response)
images_list = jd['photos']['photo']
if len(images_list)==0:
pass
else:
for image_list in images_list:
dict = {}
dict['id'] = image_list['id']
dict['server']=image_list['server']
dict['secret']=image_list['secret']
dict['count_comments'] = image_list['count_comments']
dict['imageurl'] = 'http://farm5.staticflickr.com/{0}/{1}_{2}.jpg'.format(dict['server'], dict['id'],dict['secret'])
list.append(dict)
return list
def save_images(list):
for i in range(0,len(list)):
img_url=list[i].get('imageurl')
img_name=os.path.join(path,list[i].get('id')+'.jpg')
if os.path.exists(img_name):
continue
with open(img_name,'wb') as fp:
fp.write(requests.get(img_url).content)
if __name__=='__main__':
list = []
#針對於flicker網站,圖片所存放的網址不是特別按照一定的規律
#每次進行記得查看符合的json的headedrs
start_url='https://api.flickr.com/services/rest?sort=relevance&parse_tags=1&content_type=7&extras=can_comment%2Ccount_comments%2Ccount_faves%2Cdescription%2Cisfavorite%2Clicense%2Cmedia%2Cneeds_interstitial%2Cowner_name%2Cpath_alias%2Crealname%2Crotation%2Curl_c%2Curl_l%2Curl_m%2Curl_n%2Curl_q%2Curl_s%2Curl_sq%2Curl_t%2Curl_z&per_page=25&page=2&lang=zh-Hant-HK&ytcheck=1&new_session=1&text=tree&viewerNSID=162943202%40N02&method=flickr.photos.search&csrf=1531144274%3Ac16v3htbthn%3A0372cbc990da575b6dabaeb9fbd2a1d8&api_key=b46c1c97c76e385a586bbc613642f41a&format=json&hermes=1&hermesClient=1&reqId=7a39fab7&nojsoncallback=1'
#for page in range(1,3):
# url=start_url.format(page)
list=get_imginfo(list,req,start_url)
save_images(list)
