爬取目標

1.本次代碼是在python2上運行通過的，python3的只需改2行代碼。用到的其它python模塊如下：

selenium 2.53.6 +firefox 44
BeautifulSoup
requests

2.爬取目標網站，我的博客：https://home.cnblogs.com/u/yoyoketang
爬取內容：爬我的博客的所有粉絲的名稱，並保存到txt

3.由於博客園的登錄是需要人機驗證的，所以是無法直接用賬號密碼登錄，需借助selenium登錄

selenium獲取cookies

1.大前提：先手工操作瀏覽器，登錄我的博客，並記住密碼
（保證關掉瀏覽器后，下次打開瀏覽器訪問我的博客時候是登錄狀態）
2.selenium默認啟動瀏覽器是一個空的配置，默認不加載配置緩存文件，這里先得找到對應瀏覽器的配置文件地址，以火狐瀏覽器為例
3.使用driver.get_cookies()方法獲取瀏覽器的cookies

# coding:utf-8
import requests
from selenium import webdriver
from bs4 import BeautifulSoup
import re
import time
# Path to the Firefox profile that already holds a logged-in session
profile_directory = r'C:\Users\admin\AppData\Roaming\Mozilla\Firefox\Profiles\yn80ouvt.default'
# Load that profile; selenium otherwise starts the browser with an empty one
profile = webdriver.FirefoxProfile(profile_directory)
# Launch the browser with the loaded profile
driver = webdriver.Firefox(profile)

try:
    driver.get("https://home.cnblogs.com/u/yoyoketang/followers/")

    # Fixed 3-second pause before reading cookies (presumably to let the page load)
    time.sleep(3)
    cookies = driver.get_cookies()  # cookies of the browser session
    print(cookies)
finally:
    # Always close the browser, even if loading the page or reading cookies fails
    driver.quit()

（注：要是這里腳本啟動瀏覽器后，打開的博客頁面是未登錄的，后面內容都不用看了，先檢查配置文件是不是寫錯了）

requests添加登錄的cookies

1.瀏覽器的cookies獲取到后，接下來用requests去建一個session，在session里添加登錄成功后的cookies

# Start a fresh requests session
s = requests.session()

# Copy the name/value pair of every browser cookie into a cookie jar
jar = requests.cookies.RequestsCookieJar()
for cookie in cookies:
    jar.set(cookie["name"], cookie["value"])

# Merge the jar into the session's own cookies
s.cookies.update(jar)

計算粉絲數和分頁總數

1.由於我的粉絲的數據是分頁展示的，這里一次只能請求到45個，所以先獲取粉絲總數，然后計算出總的頁數

# Request the first page of the followers list
r1 = s.get("https://home.cnblogs.com/u/yoyoketang/relation/followers")

soup = BeautifulSoup(r1.content, "html.parser")

# The element with class "current_nav" carries the follower count in parentheses
fensinub = soup.find_all(class_="current_nav")
print(fensinub[0].string)
# Backslashes are doubled so the pattern does not rely on an invalid string escape
num = re.findall(u"我的粉絲\\((.+?)\\)", fensinub[0].string)
print(u"我的粉絲數量：%s" % str(num[0]))

# 45 followers per page: round up, so an exact multiple of 45 does not yield an
# extra empty page; always report at least one page.
ye = max(1, (int(num[0]) + 44) // 45)
print(u"總共分頁數：%s" % str(ye))

保存粉絲名到txt

import io  # io.open accepts an encoding on both Python 2 and Python 3

# Save the first page, which is already parsed into `soup`
fensi = soup.find_all(class_="avatar_name")
if fensi:
    with io.open("name.txt", "a", encoding="utf-8") as f:  # append mode
        for tag in fensi:
            name = tag.string.replace("\n", "").replace(" ", "")
            print(name)
            f.write(name + u"\n")

# Fetch and save pages 2..ye
for page in range(2, ye+1):
    r2 = s.get("https://home.cnblogs.com/u/yoyoketang/relation/followers?page=%s" % str(page))
    # Parse the page that was just fetched (r2), not the first page again
    soup = BeautifulSoup(r2.content, "html.parser")
    # Follower names on this page
    fensi = soup.find_all(class_="avatar_name")
    if not fensi:
        continue

    with io.open("name.txt", "a", encoding="utf-8") as f:  # append mode
        for tag in fensi:
            name = tag.string.replace("\n", "").replace(" ", "")
            print(name)
            f.write(name + u"\n")

參考代碼：

# coding:utf-8
import requests
from selenium import webdriver
from bs4 import BeautifulSoup
import re
import time

# Path to the Firefox profile directory loaded by get_cookies()
profile_directory = r'C:\Users\admin\AppData\Roaming\Mozilla\Firefox\Profiles\yn80ouvt.default'

s = requests.session()  # create a new session, shared by the functions below
# Base URL that the followers paths are appended to
url = "https://home.cnblogs.com/u/yoyoketang"

def get_cookies(url):
    '''Start Firefox via selenium and return the cookies of the browser session.

    The browser is launched with the profile at ``profile_directory`` and
    opens ``url + "/followers"``.

    :param url: base URL; "/followers" is appended to it.
    :return: the list returned by ``driver.get_cookies()``, or None if
        anything fails (the error is printed, not raised).
    '''
    try:
        # Load the profile
        profile = webdriver.FirefoxProfile(profile_directory)
        # Launch the browser with the loaded profile
        driver = webdriver.Firefox(profile)
        try:
            driver.get(url+"/followers")

            # Fixed 3-second pause before reading cookies
            time.sleep(3)
            cookies = driver.get_cookies()  # cookies of the browser session
            print(cookies)
        finally:
            # Close the browser even when loading the page or reading cookies
            # fails, so a failed run does not leave a Firefox process behind.
            driver.quit()
        return cookies
    except Exception as msg:
        print(u"啟動瀏覽器報錯了：%s" %str(msg))
        return None
def add_cookies(cookies):
    '''Copy browser cookies into the module-level requests session ``s``.

    :param cookies: iterable of mappings with "name" and "value" keys.
    :return: None. Any error is printed instead of being raised.
    '''
    try:
        # Collect the name/value pairs in a jar first, then merge it into the session
        jar = requests.cookies.RequestsCookieJar()
        for entry in cookies:
            jar.set(entry["name"], entry['value'])
        s.cookies.update(jar)
    except Exception as msg:
        print(u"添加cookies的時候報錯了：%s" % str(msg))

def get_ye_nub(url):
    '''Return the number of pages in the followers list.

    Requests ``url + "/relation/followers"`` through the session ``s``, reads
    the follower count from the element with class "current_nav" and divides
    it by the page size of 45, rounding up.

    :param url: base URL; "/relation/followers" is appended to it.
    :return: page count (at least 1). On any error the error is printed and
        1 is returned.
    '''
    per_page = 45
    try:
        # Request the first page of the followers list
        r1 = s.get(url+"/relation/followers")
        soup = BeautifulSoup(r1.content, "html.parser")
        # The follower count is in parentheses in the "current_nav" element
        fensinub = soup.find_all(class_="current_nav")
        print(fensinub[0].string)
        # Backslashes are doubled so the pattern does not rely on an invalid string escape
        num = re.findall(u"我的粉絲\\((.+?)\\)", fensinub[0].string)
        print(u"我的粉絲數量：%s"%str(num[0]))

        # Ceiling division: an exact multiple of the page size must not
        # produce an extra, empty page. Never report fewer than one page.
        total = int(num[0])
        ye = max(1, (total + per_page - 1) // per_page)
        print(u"總共分頁數：%s"%str(ye))
        return ye
    except Exception as msg:
        print(u"獲取粉絲頁數報錯了，默認返回數量1 ：%s"%str(msg))
        return 1

def save_name(nub):
    '''Fetch one page of the followers list and append the names to name.txt.

    Each name is the text of an element with class "avatar_name", with
    newlines and spaces removed. Names are printed and written one per line,
    UTF-8 encoded.

    :param nub: 1-based page number; values <= 1 request the list without a
        ``page`` query parameter.
    :return: None. Any error is printed instead of being raised.
    '''
    # Local import: io.open accepts an encoding on both Python 2 and Python 3,
    # so the same write path works on either version.
    import io

    try:
        if nub <= 1:
            url_page = url+"/relation/followers"
        else:
            url_page = url+"/relation/followers?page=%s" % str(nub)
        print(u"正在抓取的頁面：%s" %url_page)
        # NOTE(review): verify=False disables TLS certificate verification for
        # this request; kept as in the original -- confirm it is still needed.
        r2 = s.get(url_page, verify=False)
        soup = BeautifulSoup(r2.content, "html.parser")
        fensi = soup.find_all(class_="avatar_name")
        if not fensi:
            # Nothing to save; do not create or touch the file
            return

        # Open the file once per page instead of once per name
        with io.open("name.txt", "a", encoding="utf-8") as f:  # append mode
            for tag in fensi:
                name = tag.string.replace("\n", "").replace(" ", "")
                print(name)
                f.write(name + u"\n")
    except Exception as msg:
        print(u"抓取粉絲名稱過程中報錯了 ：%s"%str(msg))

if __name__ == "__main__":
    # Read the cookies from the browser, copy them into the session, then
    # save the follower names of every page.
    cookies = get_cookies(url)
    add_cookies(cookies)
    n = get_ye_nub(url)
    # Iterate the range directly; wrapping it in list() only built a throwaway copy
    for i in range(1, n+1):
        save_name(i)

---------------------------------python接口自動化完整版-------------------------

全書購買地址 https://yuedu.baidu.com/ebook/585ab168302b3169a45177232f60ddccda38e695

作者：上海-悠悠 QQ交流群：588402570

也可以關注下我的個人公眾號：

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯系本站郵箱yoyou2525@163.com刪除。

猜您在找 【Python】 requests 爬取博客園內容AttributeError: 'NoneType' object has no attribute 'xpath' python requests庫爬取視頻 Python爬取CSDN博客文章 python的requests模塊爬取網頁內容 python3爬蟲-通過requests爬取西刺代理 python爬蟲---實現項目(一) Requests爬取HTML信息 16-python爬蟲之Requests庫爬取海量圖片 python3 requests爬取gbk時候遇到編碼的坑 python+requests爬取百度文庫ppt Python：requests 爬取40天天氣預報