注意:模擬登陸時,必須保證settings.py里的COOKIES_ENABLED(Cookies中間件) 處於開啟狀態
COOKIES_ENABLED = True
或者將 COOKIES_ENABLED = False 這一行註釋掉(即保留 # COOKIES_ENABLED = False)
策略一:直接POST數據(比如需要登陸的賬戶信息)
只要是需要提供post數據的,就可以用這種方法。下面示例里post的數據是賬戶密碼:
- 可以使用
yield scrapy.FormRequest(url, formdata, callback)
方法發送POST請求。 - 如果希望程序執行一開始就發送POST請求,可以重寫Spider類的
start_requests(self)
方法,並且不再調用start_urls
里的url。
class mySpider(scrapy.Spider):
    """Spider that logs in by POSTing credentials as its very first request.

    start_urls is left commented out: start_requests() replaces it entirely.
    """

    # start_urls = ["http://www.example.com/"]

    def start_requests(self):
        # Action URL extracted from the login page's <form> tag.
        login_url = 'http://www.renren.com/PLogin.do'
        credentials = {
            "email": "mr_mao_hacker@163.com",
            "password": "axxxxxxxe",
        }
        # FormRequest is how Scrapy sends a POST request.
        yield scrapy.FormRequest(url=login_url,
                                 formdata=credentials,
                                 callback=self.parse_page)

    def parse_page(self, response):
        # do something -- business logic for the logged-in page goes here
        pass
策略二:標准的模擬登陸步驟
正統模擬登錄方法:
- 首先發送登錄頁面的get請求,獲取到頁面里的登錄必須的參數(比如說zhihu登陸界面的 _xsrf)
- 然后和賬戶密碼一起post到服務器,登錄成功
- 使用
FormRequest.from_response()
方法模擬用戶登錄。
import scrapy
class LoginSpider(scrapy.Spider):
    """Standard two-step login.

    First GET the login page, then POST credentials with
    FormRequest.from_response() so hidden form fields (e.g. a CSRF token
    such as zhihu's _xsrf) are carried over automatically.
    """

    name = 'example.com'
    start_urls = ['http://www.example.com/users/login.php']

    def parse(self, response):
        # from_response() pre-fills the form found in `response` and merges
        # our credentials into it before submitting.
        return scrapy.FormRequest.from_response(
            response,
            formdata={'username': 'john', 'password': 'secret'},
            callback=self.after_login
        )

    def after_login(self, response):
        # Check login succeeded before going on.
        # Fix: response.body is bytes, so test against a bytes literal —
        # the original `"..." in response.body` raises TypeError on Python 3.
        if b"authentication failed" in response.body:
            # Fix: `log.ERROR` was never imported; use the spider's logger.
            self.logger.error("Login failed")
            return
        # continue scraping with authenticated session...
模擬瀏覽器登錄
start_requests()方法,可以返回一個請求給爬蟲的起始網站,這個返回的請求相當於start_urls,start_requests()返回的請求會替代start_urls里的請求
Request()get請求,可以設置,url、cookie、回調函數
FormRequest.from_response()表單post提交,第一個必須參數,上一次響應cookie的response對象,其他參數,cookie、url、表單內容等
正統模擬登錄方法
import scrapy
# 正統模擬登錄方法:
# 首先發送登錄頁面的get請求,獲取到頁面里的登錄必須的參數,比如說zhihu的 _xsrf
# 然后和賬戶密碼一起post到服務器,登錄成功
# 第二種標准
def parse(self, response):
    """Step two of the standard login: POST credentials via the login form.

    `response` is the GET of the login page; from_response() merges our
    fields into the form it finds there (including hidden token fields).
    """
    print(response.body.decode('utf-8'), "@@" * 40)
    login_fields = {
        "email": "18588403840",
        "origURL": "http://www.renren.com/422167102/profile",
        "domain": "renren.com",
        "key_id": "1",
        "captcha_type": "web_login",
        "password": "97bfc03b0eec4df7c76eaec10cd08ea57b01eefd0c0ffd4c0e5061ebd66460d9",
        "rkey": "26615a8e93fee56fc1fb3d679afa3cc4",
        "f": "",
    }
    yield scrapy.FormRequest.from_response(
        response,
        formdata=login_fields,
        dont_filter=True,          # the login URL repeats; skip dedup filtering
        headers=self.headers,
        callback=self.get_page,
    )
def get_page(self, response):
    """Dump the post-login page, then request a profile that needs auth."""
    print("===================", response.url)
    print(response.body.decode('utf-8'))
    profile_url = "http://www.renren.com/353111356/profile"
    yield scrapy.Request(profile_url, callback=self.get_info)
def get_info(self, response):
    """Print a visual separator, then the decoded page body."""
    separator = '*******' * 30
    print(separator)
    print(response.body.decode('utf-8'))
yield Request()可以將一個新的請求返回給爬蟲執行
在發送請求時cookie的操作, meta={'cookiejar':1}表示開啟cookie記錄,首次請求時寫在Request()里 meta={'cookiejar':response.meta['cookiejar']}表示使用上一次response的cookie,寫在FormRequest.from_response()里post授權 meta={'cookiejar':True}表示使用授權后的cookie訪問需要登錄查看的頁面
import scrapy
from scrapy.http import HtmlResponse, Request
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
class MyrenSpider(CrawlSpider):
    """CrawlSpider that logs into renren.com first and then propagates the
    authenticated cookiejar to every link the crawl rules follow.

    Cookiejar flow:
      - meta={'cookiejar': 1} in start_requests opens a named cookie session;
      - meta={'cookiejar': response.meta['cookiejar']} reuses it for the
        login POST and for every subsequent request.
    """

    name = 'myren'
    allowed_domains = ['renren.com']
    start_urls = ["http://www.renren.com/353111356/profile"]
    # Fix: raw string for the regex — `\d` inside a plain string is a
    # SyntaxWarning (and eventually an error) on modern Python.
    rules = [Rule(LinkExtractor(allow=(r'(\d+)/profile')), callback='get_info', follow=True)]
    headers = {
        "Accept": "*/*",
        "Connection": "keep-alive",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.111 Safari/537.36",
    }

    def start_requests(self):
        # Open a named cookie session (cookiejar=1) with the first request.
        yield scrapy.Request(url="http://www.renren.com/",
                             meta={'cookiejar': 1},
                             callback=self.post_login)

    def post_login(self, response):
        """Submit credentials, reusing the cookiejar opened in start_requests."""
        yield scrapy.FormRequest.from_response(
            response,
            url="http://www.renren.com/PLogin.do",
            # Reuse the cookiejar opened earlier with meta={'cookiejar': 1}.
            meta={'cookiejar': response.meta['cookiejar']},
            headers=self.headers,
            formdata={
                "email": "18588403840",
                "password": "Changeme_123"
            },
            dont_filter=True,
            callback=self.after_login)

    def after_login(self, response):
        # Re-issue the start URLs, now carrying the authenticated cookiejar.
        for url in self.start_urls:
            # yield self.make_requests_from_url(url)
            yield scrapy.Request(url, meta={'cookiejar': response.meta['cookiejar']})

    def get_info(self, response):
        print('*******' * 30)
        print(response.body.decode('utf-8'))

    def _requests_to_follow(self, response):
        """Override CrawlSpider's link-following to propagate the cookiejar.

        Fix: the original used `Request` and `HtmlResponse` without importing
        them; they must come from scrapy.http (added to the file imports).
        """
        if not isinstance(response, HtmlResponse):
            return
        seen = set()
        for n, rule in enumerate(self._rules):
            links = [l for l in rule.link_extractor.extract_links(response) if l not in seen]
            if links and rule.process_links:
                links = rule.process_links(links)
            for link in links:
                seen.add(link)
                r = Request(url=link.url, callback=self._response_downloaded)
                # Customization vs. the stock CrawlSpider: carry the
                # authenticated cookiejar into each followed request.
                r.meta.update(rule=n, link_text=link.text,
                              cookiejar=response.meta['cookiejar'])
                yield rule.process_request(r)
策略三:直接使用保存登陸狀態的Cookie模擬登陸
如果實在沒辦法了,可以用這種方法模擬登錄,雖然麻煩一點,但是成功率100%
ChangeCookies 將cookie解析成字典形式
class transCookie:
    """Convert a raw Cookie header string copied from a browser into the
    dict form that Scrapy's `cookies=` argument expects."""

    def __init__(self, cookie):
        # cookie: raw "k1=v1; k2=v2; ..." string copied from the browser.
        self.cookie = cookie

    def stringToDict(self):
        '''
        Turn the browser-copied cookie string into a dict Scrapy can use.
        :return: dict mapping cookie names to their values
        '''
        itemDict = {}
        for item in self.cookie.split(';'):
            # Fix: split on the FIRST '=' only — cookie values (base64,
            # URL-encoded data, timestamps) often contain '=' themselves,
            # and the original `split('=')[1]` truncated them. Segments
            # without '=' (e.g. a trailing ';') are skipped instead of
            # raising IndexError.
            key, sep, value = item.partition('=')
            if sep:
                itemDict[key.strip()] = value
        return itemDict
if __name__ == "__main__":
    # Paste your browser's cookie string below, then run to see the dict.
    raw_cookie = "你的cookie"
    print(transCookie(raw_cookie).stringToDict())
將解析好的cookie格式放入請求
# -*- coding: utf-8 -*-
import scrapy
class RenrenSpider(scrapy.Spider):
    """Strategy 3: log in by attaching browser-copied cookies directly.

    Instead of POSTing credentials, reuse the cookies of an already
    logged-in browser session for every start URL.
    """

    name = "renren"
    allowed_domains = ["renren.com"]
    # Start URL list: pages that require an authenticated session.
    start_urls = [
        'http://www.renren.com/111111',
        'http://www.renren.com/222222',
        'http://www.renren.com/333333',
    ]
    # Cookie dict copied from a logged-in browser session.
    cookies = {
        "anonymid" : "ixrna3fysufnwv",
        "_r01_" : "1",
        "ap" : "327550029",
        "JSESSIONID" : "abciwg61A_RvtaRS3GjOv",
        "depovince" : "GW",
        "springskin" : "set",
        "jebe_key" : "f6fb270b-d06d-42e6-8b53-e67c3156aa7e%7Cc13c37f53bca9e1e7132d4b58ce00fa3%7C1484060607478%7C1%7C1486198628950",
        "t" : "691808127750a83d33704a565d8340ae9",
        "societyguester" : "691808127750a83d33704a565d8340ae9",
        "id" : "327550029",
        "xnsid" : "f42b25cf",
        "loginfrom" : "syshome"
    }

    def start_requests(self):
        # Override start_requests so every start URL carries the cookies.
        # Fix: the original referenced an undefined name `url` (NameError);
        # iterate over start_urls instead.
        return [scrapy.FormRequest(url, cookies=self.cookies, callback=self.parse)
                for url in self.start_urls]

    def parse(self, response):
        """Save the authenticated page to disk for inspection."""
        # Fix: print() function — the original Py2 print statement is a
        # SyntaxError on Python 3.
        print("===========" + response.url)
        # Fix: response.body is bytes, so the file must be opened in
        # binary mode; text mode raised TypeError.
        with open("deng.html", "wb") as filename:
            filename.write(response.body)
策略四 : 使用selenium插件(全能)
1 spider.browser.page_source 獲取響應的源代碼
2 session.get(request.url).text 獲取響應的源代碼
3 requests采用session管理cookie
4 urllib 采用cookieJar管理cookie
模擬登錄淘寶
class TaobaoSpider(scrapy.Spider):
    """Taobao spider whose login is delegated to a Selenium-based
    downloader middleware; this spider only prints what comes back."""

    name = 'mytaobao'
    allowed_domains = ['taobao.com']
    start_urls = [
        'https://login.m.taobao.com/login.htm',
        "http://h5.m.taobao.com/mlapp/olist.html?spm=a2141.7756461.2.6",
    ]

    def __init__(self):
        # Both attributes are filled in later by the login middleware.
        self.browser = None
        self.cookies = None
        # Hand off to the parent class initializer.
        super(TaobaoSpider, self).__init__()

    def parse(self, response):
        """Print the response URL and its (leniently decoded) page source."""
        print(response.url)
        page_source = response.body.decode("utf-8", "ignore")
        print(page_source)
#中間件middleware 自定義LoginMiddleware登錄
from scrapy import signals
from selenium import webdriver
from scrapy.http import HtmlResponse # 網頁響應
import requests
import time
class LoginMiddleware(object):
    '''
    Downloader middleware that performs the Taobao login with Selenium.

    On the login page it types the credentials into the username/password
    inputs, clicks the submit button, and captures the session cookies via
    spider.browser.get_cookies(). For every later request it replays those
    cookies through a requests.session and hands the fetched page back to
    Scrapy as an HtmlResponse.
    '''
    def process_request(self, request, spider):
        # Only handle the spider with this exact name.
        if spider.name == "mytaobao":
            # Is this the login page?
            if request.url.find("login") != -1:
                mobilesetting = {"deviceName": "iPhone 6 Plus"}
                options = webdriver.ChromeOptions()  # browser options
                # Emulate a mobile device so the mobile login form is served.
                options.add_experimental_option("mobileEmulation", mobilesetting)
                # NOTE(review): `chrome_options=` is deprecated in Selenium 4
                # (use `options=`) — confirm the installed Selenium version.
                spider.browser = webdriver.Chrome(chrome_options=options)  # create a browser
                spider.browser.set_window_size(400, 800)  # phone-sized window
                spider.browser.get(request.url)  # open the login URL
                # Must sleep: the inputs need time to appear before typing.
                time.sleep(3)
                print("login訪問", request.url)
                # NOTE(review): find_element_by_id was removed in Selenium 4;
                # would need find_element(By.ID, ...) there — confirm version.
                username = spider.browser.find_element_by_id("username")
                password = spider.browser.find_element_by_id("password")
                time.sleep(1)
                username.send_keys("2403239393@qq.com")  # account
                time.sleep(2)
                password.send_keys("bama100")  # password
                time.sleep(2)
                spider.browser.find_element_by_id("btn-submit").click()
                time.sleep(4)
                # Capture all cookies of the now logged-in session.
                spider.cookies = spider.browser.get_cookies()
                # spider.browser.close()
                # Returning an HtmlResponse short-circuits the download:
                # Scrapy treats this as the response for `request`.
                return HtmlResponse(url=spider.browser.current_url,  # current URL
                                    body=spider.browser.page_source,  # page source
                                    encoding="utf-8")  # page handed back to Scrapy
            else:  # already logged in: replay the captured cookies
                '''
                1 采用requests.session保存cookie
                2 设置cookie session.cookie.set(name,value)
                3 清空headers session.headers.clear()
                4 发起get请求 session.get(url)
                '''
                print("request 訪問")
                session = requests.session()  # session keeps cookies between calls
                # Copy the Selenium-captured cookies into the requests session.
                for cookie in spider.cookies:
                    session.cookies.set(cookie['name'], cookie["value"])
                session.headers.clear()  # drop the default request headers
                newpage = session.get(request.url)
                print("---------------------")
                print(request.url)
                print("---------------------")
                print(newpage.text)
                print("---------------------")
                # Fetched page; sleep to throttle requests.
                time.sleep(3)
                return HtmlResponse(url=request.url,  # requested URL
                                    body=newpage.text,  # page source
                                    encoding="utf-8")  # page handed back to Scrapy