Python爬蟲實現統計博客園博文數量、閱讀量、評論數


 

如何使用

只需要將代碼中的headurl替換為以下格式的鏈接,其中你只需要把鏈接中的用戶名部分(下例中的「-wenli」)改成你自己的博客園用戶名。

類似:
https://www.cnblogs.com/-wenli/default.html?page=

 

 

原理

使用requests爬取網頁,再使用BeautifulSoup解析網頁、獲取數據,並對數據做預處理,最後使用正則表達式匹配出需要的數據。

最後的數據使用一個大字典存儲。

爬取網頁

爬取網頁這裡做了異常處理。

def get_one_page(url, headers, timeout=10):
    """Download one page and return its text, or None on failure.

    Args:
        url: Address of the page to fetch.
        headers: HTTP headers forwarded to ``requests.get``.
        timeout: Seconds to wait for the server. Without a timeout a stalled
            connection would block the crawl indefinitely.

    Returns:
        The response body as text when the status code is 200. None for any
        other status code or when a ``RequestException`` (including a
        timeout) is raised.
    """
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except RequestException:
        return None
    if response.status_code == 200:
        return response.text
    # Non-200 answers are reported the same way as network errors.
    return None

解析網頁

def parse_one_page(html):
    """Extract read and comment counts for every post on one listing page.

    Each post found is stored in the module-level ``item`` dict as
    ``item[title] = {"閱讀量": <digits>, "評論量": <digits>}``. When the page
    contains no ``div.day`` blocks, or there is no page at all, the
    module-level ``flag`` is set to False.

    Args:
        html: Markup of the listing page; None or "" when nothing was
            downloaded.

    Returns:
        "" when the page has no posts; otherwise None.
    """
    global item, flag
    # A failed download arrives as None, which BeautifulSoup cannot parse;
    # treat it like an empty page instead of crashing.
    if not html:
        flag = False
        return ""
    soup = BeautifulSoup(html, 'lxml')
    days = soup.find_all('div', class_='day')
    if not days:
        flag = False
        return ""
    # Raw string so that \d is a regex escape rather than a string escape.
    # Both Traditional and Simplified labels are accepted so the pattern
    # matches whichever script the page is served in.
    pattern = re.compile(
        r'^.*(?:閱讀|阅读).(\d+)..*(?:評論|评论).(\d+)..*(?:編輯|编辑)$'
    )
    for day in days:
        title_divs = day.find_all('div', class_='postTitle')
        desc_divs = day.find_all('div', class_='postDesc')
        # Titles and descriptions are paired by position within the day block.
        for title_div, desc_div in zip(title_divs, desc_divs):
            link = title_div.find('a', class_='postTitle2')
            if link is None:
                continue
            # Strip newlines and spaces so the text is one compact token.
            title = link.get_text().replace('\n', '').replace(' ', '')
            info = desc_div.get_text().replace('\n', '').replace(' ', '')
            match = pattern.match(info)
            if match is None:
                # Unexpected footer layout: skip this post rather than abort.
                continue
            item[title] = {"閱讀量": match.group(1), "評論量": match.group(2)}

統計數據

def statistics():
    """Print the post count and the summed read and comment counts.

    Reads the module-level ``item`` mapping; each value must provide the keys
    "閱讀量" and "評論量" with contents that ``int()`` can convert.
    """
    reads = 0
    comments = 0
    for counts in item.values():
        reads += int(counts['閱讀量'])
        comments += int(counts['評論量'])
    print('總博文量:', len(item))
    print('總閱讀量:', reads)
    print('總評論量:', comments)

源碼

from bs4 import BeautifulSoup
import requests
from requests.exceptions import RequestException
import re
import time

def get_one_page(url, headers, timeout=10):
    """Download one page and return its text, or None on failure.

    Args:
        url: Address of the page to fetch.
        headers: HTTP headers forwarded to ``requests.get``.
        timeout: Seconds to wait for the server. Without a timeout a stalled
            connection would block the crawl indefinitely.

    Returns:
        The response body as text when the status code is 200. None for any
        other status code or when a ``RequestException`` (including a
        timeout) is raised.
    """
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except RequestException:
        return None
    if response.status_code == 200:
        return response.text
    # Non-200 answers are reported the same way as network errors.
    return None
def parse_one_page(html):
    """Extract read and comment counts for every post on one listing page.

    Each post found is stored in the module-level ``item`` dict as
    ``item[title] = {"閱讀量": <digits>, "評論量": <digits>}``. When the page
    contains no ``div.day`` blocks, or there is no page at all, the
    module-level ``flag`` is set to False.

    Args:
        html: Markup of the listing page; None or "" when nothing was
            downloaded.

    Returns:
        "" when the page has no posts; otherwise None.
    """
    global item, flag
    # A failed download arrives as None, which BeautifulSoup cannot parse;
    # treat it like an empty page instead of crashing.
    if not html:
        flag = False
        return ""
    soup = BeautifulSoup(html, 'lxml')
    days = soup.find_all('div', class_='day')
    if not days:
        flag = False
        return ""
    # Raw string so that \d is a regex escape rather than a string escape.
    # Both Traditional and Simplified labels are accepted so the pattern
    # matches whichever script the page is served in.
    pattern = re.compile(
        r'^.*(?:閱讀|阅读).(\d+)..*(?:評論|评论).(\d+)..*(?:編輯|编辑)$'
    )
    for day in days:
        title_divs = day.find_all('div', class_='postTitle')
        desc_divs = day.find_all('div', class_='postDesc')
        # Titles and descriptions are paired by position within the day block.
        for title_div, desc_div in zip(title_divs, desc_divs):
            link = title_div.find('a', class_='postTitle2')
            if link is None:
                continue
            # Strip newlines and spaces so the text is one compact token.
            title = link.get_text().replace('\n', '').replace(' ', '')
            info = desc_div.get_text().replace('\n', '').replace(' ', '')
            match = pattern.match(info)
            if match is None:
                # Unexpected footer layout: skip this post rather than abort.
                continue
            item[title] = {"閱讀量": match.group(1), "評論量": match.group(2)}
def statistics():
    """Print the post count and the summed read and comment counts.

    Reads the module-level ``item`` mapping; each value must provide the keys
    "閱讀量" and "評論量" with contents that ``int()`` can convert.
    """
    reads = 0
    comments = 0
    for counts in item.values():
        reads += int(counts['閱讀量'])
        comments += int(counts['評論量'])
    print('總博文量:', len(item))
    print('總閱讀量:', reads)
    print('總評論量:', comments)

def kind():
    """Placeholder with no behavior yet; always returns None."""
    return None

def main():
    """Crawl the blog's listing pages in order, then print the totals.

    Pages are requested as ``headurl + <page number>`` starting at 1. The
    loop runs while the module-level ``flag`` is true; it also stops early
    when a page cannot be downloaded.
    """
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko'}
    headurl = 'https://www.cnblogs.com/-wenli/default.html?page='
    page = 1
    while flag:
        url = headurl + str(page)
        print(url)
        # Fetch the page source.
        html = get_one_page(url, headers)
        if html is None:
            # get_one_page reports a failed download as None; handing that to
            # the parser would crash, so stop and report what was collected.
            print('獲取頁面失敗,停止爬取:', url)
            break
        # Parse the page source.
        parse_one_page(html)
        page += 1

    # Print the aggregated statistics.
    statistics()


# Crawl state shared by parse_one_page, statistics and main. It is defined at
# module level, not only under the __main__ guard, so the functions also work
# when this file is imported instead of being run as a script.
flag = True  # main loops while this is true; parse_one_page sets it to False
item = {}    # post title -> {"閱讀量": ..., "評論量": ...}

if __name__ == '__main__':
    main()

 演示結果:

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯繫本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM