爬取目標
1.本次代碼是在python2上運行通過的,python3只需改2行代碼,用到的其它python模塊如下:
- selenium 2.53.6 +firefox 44
- BeautifulSoup
- requests
2.爬取目標網站,我的博客:https://home.cnblogs.com/u/yoyoketang
爬取內容:爬我的博客的所有粉絲的名稱,並保存到txt
3.由於博客園的登錄是需要人機驗證的,所以是無法直接用賬號密碼登錄,需借助selenium登錄

selenium獲取cookies
1.大前提:先手工操作瀏覽器,登錄我的博客,並記住密碼
(保證關掉瀏覽器后,下次打開瀏覽器訪問我的博客時候是登錄狀態)
2.selenium默認啟動瀏覽器是一個空的配置,默認不加載配置緩存文件,這里先得找到對應瀏覽器的配置文件地址,以火狐瀏覽器為例
3.使用driver.get_cookies()方法獲取瀏覽器的cookies
# coding:utf-8
import requests
from selenium import webdriver
from bs4 import BeautifulSoup
import re
import time
# Path of the Firefox profile directory that selenium should load
# (selenium starts with an empty profile unless one is given).
profile_directory = r'C:\Users\admin\AppData\Roaming\Mozilla\Firefox\Profiles\yn80ouvt.default'
# Load the profile.
profile = webdriver.FirefoxProfile(profile_directory)
# Start Firefox with that profile.
driver = webdriver.Firefox(profile)
try:
    driver.get("https://home.cnblogs.com/u/yoyoketang/followers/")
    time.sleep(3)  # fixed pause after navigation, before the cookies are read
    cookies = driver.get_cookies()  # cookies of the current browser session
    print(cookies)
finally:
    # Close the browser even when loading the page or reading cookies fails,
    # so a failed run does not leave a Firefox process behind.
    driver.quit()
(注:要是這里腳本啟動瀏覽器后,打開的博客頁面是未登錄的,后面內容都不用看了,先檢查配置文件是不是寫錯了)
requests添加登錄的cookies
1.瀏覽器的cookies獲取到后,接下來用requests去建一個session,在session里添加登錄成功后的cookies
# Create the session that later requests are sent through.
s = requests.session()
# Copy every browser cookie (name/value pair) into a cookie jar ...
c = requests.cookies.RequestsCookieJar()
for cookie in cookies:
    c.set(cookie["name"], cookie['value'])
# ... and merge that jar into the session's own cookies.
s.cookies.update(c)
計算粉絲數和分頁總數
1.由於我的粉絲的數據是分頁展示的,這里一次只能請求到45個,所以先獲取粉絲總數,然后計算出總的頁數
# Request the followers page through the session that carries the cookies.
r1 = s.get("https://home.cnblogs.com/u/yoyoketang/relation/followers")
soup = BeautifulSoup(r1.content, "html.parser")
# The follower count is read from the first element with class "current_nav".
fensinub = soup.find_all(class_="current_nav")
print(fensinub[0].string)
# Backslashes are doubled so the regex is unchanged but the literal has no
# invalid "\(" escape (a warning on recent Python 3 versions).
num = re.findall(u"我的粉絲\\((.+?)\\)", fensinub[0].string)
print(u"我的粉絲數量:%s" % str(num[0]))
# Number of pages at 45 followers per page, rounded up. Plain
# int(n / 45) + 1 reports one page too many when n is an exact multiple
# of 45. At least one page is always reported.
ye = max(1, (int(num[0]) + 44) // 45)
print(u"總共分頁數:%s" % str(ye))
保存粉絲名到txt
# Save the follower names from the first page (already parsed into soup).
fensi = soup.find_all(class_="avatar_name")
# Open the file once per page, in append mode, instead of once per name.
with open("name.txt", "a") as f:
    for tag in fensi:
        # Strip newlines and spaces from the tag text.
        name = tag.string.replace("\n", "").replace(" ", "")
        print(name)
        # Python 2 form; on Python 3 open the file with encoding="utf-8"
        # and write name + "\n" instead.
        f.write(name.encode("utf-8") + "\n")
# Fetch and save the remaining pages, 2 .. ye.
for page in range(2, ye + 1):
    r2 = s.get("https://home.cnblogs.com/u/yoyoketang/relation/followers?page=%s" % str(page))
    # Parse the page that was just fetched (r2). Parsing r1 here would
    # save the first page's names again for every page.
    soup = BeautifulSoup(r2.content, "html.parser")
    # Collect the follower name elements of this page.
    fensi = soup.find_all(class_="avatar_name")
    with open("name.txt", "a") as f:
        for tag in fensi:
            name = tag.string.replace("\n", "").replace(" ", "")
            print(name)
            f.write(name.encode("utf-8") + "\n")

參考代碼:
# coding:utf-8
import requests
from selenium import webdriver
from bs4 import BeautifulSoup
import re
import time
# Path of the Firefox profile directory that get_cookies() loads.
profile_directory = r'C:\Users\admin\AppData\Roaming\Mozilla\Firefox\Profiles\yn80ouvt.default'
s = requests.session() # create the session shared by the functions below
# Base URL; the functions below append their own paths to it.
url = "https://home.cnblogs.com/u/yoyoketang"
def get_cookies(url):
    """Start Firefox through selenium and return the browser cookies.

    Loads the Firefox profile at ``profile_directory``, opens
    ``url + "/followers"``, pauses three seconds and reads the cookies.
    The browser is closed on every path, including failures.

    :param url: base URL to which "/followers" is appended.
    :return: the result of ``driver.get_cookies()``, or ``None`` when
        starting the browser or loading the page raised an exception
        (the error is printed, not raised).
    """
    driver = None
    try:
        # Load the profile and start the browser with it.
        profile = webdriver.FirefoxProfile(profile_directory)
        driver = webdriver.Firefox(profile)
        driver.get(url+"/followers")
        time.sleep(3)
        cookies = driver.get_cookies()  # cookies of the browser session
        print(cookies)
        return cookies
    except Exception as msg:
        print(u"啟動瀏覽器報錯了:%s" %str(msg))
        return None
    finally:
        # Quit here so a failure after start-up does not leak the browser.
        if driver is not None:
            try:
                driver.quit()
            except Exception as msg:
                # Keep the function's "print, never raise" behaviour.
                print(u"啟動瀏覽器報錯了:%s" %str(msg))
def add_cookies(cookies):
    """Copy browser cookies into the module-level session ``s``.

    Each item of ``cookies`` is read through its "name" and "value" keys.
    Any exception raised on the way is printed instead of propagated.
    """
    try:
        # Collect the name/value pairs in a fresh jar first ...
        jar = requests.cookies.RequestsCookieJar()
        for cookie in cookies:
            jar.set(cookie["name"], cookie['value'])
        # ... then merge the jar into the session's cookies.
        s.cookies.update(jar)
    except Exception as err:
        print(u"添加cookies的時候報錯了:%s" % str(err))
def get_ye_nub(url):
    """Return the number of follower pages, at 45 followers per page.

    Requests ``url + "/relation/followers"`` through the session ``s``,
    reads the follower count from the first element with class
    "current_nav" and rounds the page count up.

    :param url: base URL to which "/relation/followers" is appended.
    :return: the page count (at least 1); 1 is also returned when any
        step raises, after printing the error.
    """
    try:
        r1 = s.get(url+"/relation/followers")
        soup = BeautifulSoup(r1.content, "html.parser")
        # The follower count is embedded in the "current_nav" element text.
        fensinub = soup.find_all(class_="current_nav")
        print(fensinub[0].string)
        # Backslashes are doubled so the regex is unchanged but the literal
        # has no invalid "\(" escape (a warning on recent Python 3 versions).
        num = re.findall(u"我的粉絲\\((.+?)\\)", fensinub[0].string)
        print(u"我的粉絲數量:%s"%str(num[0]))
        # Round up: int(n / 45) + 1 reports one page too many when n is an
        # exact multiple of 45. Never report fewer than one page.
        ye = max(1, (int(num[0]) + 44) // 45)
        print(u"總共分頁數:%s"%str(ye))
        return ye
    except Exception as msg:
        print(u"獲取粉絲頁數報錯了,默認返回數量1 :%s"%str(msg))
        return 1
def save_name(nub):
    """Append the follower names found on page ``nub`` to name.txt.

    Page 1 (or lower) is requested without a page parameter; other pages
    use ``?page=<nub>``. Names are the text of elements with class
    "avatar_name", with newlines and spaces removed. Any exception is
    printed instead of propagated.

    :param nub: 1-based page number.
    """
    try:
        if nub <= 1:
            url_page = url+"/relation/followers"
        else:
            url_page = url+"/relation/followers?page=%s" % str(nub)
        print(u"正在抓取的頁面:%s" %url_page)
        # NOTE(review): verify=False disables TLS certificate verification
        # for this request, unlike the request in get_ye_nub. Confirm it is
        # intended before using this outside a test setup.
        r2 = s.get(url_page, verify=False)
        soup = BeautifulSoup(r2.content, "html.parser")
        fensi = soup.find_all(class_="avatar_name")
        # Open the file once per page, in append mode, not once per name.
        with open("name.txt", "a") as f:
            for tag in fensi:
                # .string is None when the tag has no single string child;
                # skip it rather than abandon the rest of the page.
                if tag.string is None:
                    continue
                name = tag.string.replace("\n", "").replace(" ","")
                print(name)
                # Python 2 form. On Python 3, open the file with
                # open("name.txt", "a", encoding="utf-8") and write
                # name+"\n" instead.
                f.write(name.encode("utf-8")+"\n")
    except Exception as msg:
        print(u"抓取粉絲名稱過程中報錯了 :%s"%str(msg))
if __name__ == "__main__":
    # Read the browser cookies and copy them into the shared session.
    cookies = get_cookies(url)
    add_cookies(cookies)
    # Then save the follower names page by page, from page 1 to the last.
    page_total = get_ye_nub(url)
    for page in range(1, page_total + 1):
        save_name(page)
---------------------------------python接口自動化完整版-------------------------
全書購買地址 https://yuedu.baidu.com/ebook/585ab168302b3169a45177232f60ddccda38e695
作者:上海-悠悠 QQ交流群:588402570
也可以關注下我的個人公眾號:

