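# Background:
#
# Many websites can only be browsed after logging in, so a crawler for such sites must also log in and obtain the cookie issued at login.
#
# When you log in, the server hands the client a "badge": the cookie.
#
# A page behind the login only returns its real response when the request carries that badge.
#
# Crawling pages behind a login
# Approach:
# Find the badge, then send it along with the request.
# Some cookies can be found in the request headers.
# The code below takes the cookie from the request headers of a page viewed after logging in, then sends a request with it to fetch a page that contains login-specific information.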
import urllib.request
import urllib.parse
# Request headers (including the session Cookie) that are attached to the
# request object built further down in this script.
headers = {
    "Accept": (
        "text/html,application/xhtml+xml,application/xml;q=0.9,"
        "image/webp,image/apng,*/*;q=0.8"
    ),
    # Left disabled in the original; presumably so the server does not send a
    # compressed body, which urllib would not decompress -- TODO confirm.
    # "Accept-Encoding": "gzip, deflate",
    # "Accept-Language": "zh-CN,zh;q=0.9",
    "Connection": "keep-alive",
    # One "name=value" pair per line; adjacent literals are concatenated by
    # the parser, so the resulting header value is a single string.
    "Cookie": (
        "anonymid=js2wkb2xx3aylq; "
        "depovince=GW; "
        "_r01_=1; "
        "JSESSIONID=abcwlggMRpipBajTf3LJw; "
        "ick_login=7c1fa03b-b8cf-408e-998a-6f7d34abd0d7; "
        "t=49a13f402543c813e2c6d684147af8133; "
        "societyguester=49a13f402543c813e2c6d684147af8133; "
        "id=969726303; "
        "xnsid=1a234058; "
        "jebecookies=bc2283c3-38f0-468f-b754-8f6550e1b52a|||||; "
        "ver=7.0; "
        "loginfrom=null; "
        "springskin=set; "
        "jebe_key=6b5b8da6-ae2c-4d26-ab60-66bb55a70491"
        "%7C1b833888a1eb6aca75ec4170a8e04c2d"
        "%7C1550044677566%7C1%7C1550044653252; "
        "vip=1; "
        "ch_id=10013; "
        "_ga=GA1.2.222558484.1550044669; "
        "_gid=GA1.2.491107391.1550044669; "
        "wp_fold=0"
    ),
    "Host": "www.renren.com",
    "Referer": "http://www.renren.com/969726303/profile",
    "Upgrade-Insecure-Requests": "1",
    # NOTE: the value starts with a space; kept exactly as in the original.
    "User-Agent": (
        " Mozilla/5.0 (Windows NT 6.1; WOW64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/71.0.3578.98 Safari/537.36"
    ),
}
# URL of a page that requires a logged-in session.
url = "http://www.renren.com/969726303/profile?v=info_timeline"
# Attach the headers dict defined earlier in this file (it carries the Cookie).
request = urllib.request.Request(url, headers=headers)
# Manage both the HTTP response and the output file with `with` so each is
# closed even if reading or writing raises. The response is opened first, so
# renren.html is only created once the request has succeeded.
with urllib.request.urlopen(request) as response, \
        open("renren.html", "wb") as tf:
    # Save the raw response bytes unchanged; no decoding is performed.
    tf.write(response.read())
# The code below instead captures the cookie the server issues at login time, then sends that cookie along when visiting other pages that require a login.