Python爬蟲實現統計博客園博文數量、閱讀量、評論數

本文轉載自查看原文 2020-03-11 18:39 1015 Python Crawle

如何使用

只需要將代碼中的headurl替換為以下格式的鏈接，其中你只需要改變鏈接中的陰影部分（即下例中的「-wenli」），陰影部分為你自己的博客園博客名。

類似：
https://www.cnblogs.com/-wenli/default.html?page=

原理

使用requests爬取網頁，再使用BeautifulSoup解析網頁，獲取數據並對數據做預處理，最後使用正則表達式匹配出需要的數據。

最後的數據使用一個大字典存儲。

爬取網頁

爬取網頁這裡做了異常處理。

def get_one_page(url, headers, timeout=10):
    """Download one page and return its HTML text.

    Args:
        url: Address of the page to fetch.
        headers: HTTP headers forwarded to ``requests.get``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body as text when the status code is 200, otherwise
        ``None`` (non-200 status or a ``RequestException`` during the request).
    """
    try:
        # Without a timeout a stalled connection would block the crawl
        # indefinitely; a timeout is reported as a RequestException subclass.
        response = requests.get(url, headers=headers, timeout=timeout)
    except RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None

解析網頁

def parse_one_page(html):
    """Collect read and comment counts for every post on one list page.

    Results go into the module-level ``item`` dict, keyed by post title,
    as ``{"閱讀量": <digits>, "評論量": <digits>}`` (values stay strings).
    The module-level ``flag`` is set to ``False`` when there is nothing to
    parse, which the caller uses as its stop signal.

    Args:
        html: HTML text of a list page; ``None`` or ``""`` is treated like
            a page without posts.

    Returns:
        ``""`` when the page has no ``div.day`` blocks (or no HTML was
        given), otherwise ``None``.
    """
    global item, flag
    if not html:
        # A failed download yields None; treat it as "no more pages"
        # instead of letting the parser raise on a non-string.
        flag = False
        return ""
    day_blocks = BeautifulSoup(html, 'lxml').find_all('div', class_='day')
    if not day_blocks:
        flag = False
        return ""
    # Raw string: \d must reach the regex engine, not be read as a string escape.
    # NOTE(review): the literals are Traditional Chinese; confirm they match
    # the text actually served by the site.
    stats_pattern = re.compile(r'^.*閱讀.(\d+)..*評論.(\d+)..*編輯$')
    for day in day_blocks:
        title_divs = day.find_all('div', class_='postTitle')
        desc_divs = day.find_all('div', class_='postDesc')
        # Pair the n-th title with the n-th description; zip also protects
        # against a description that has no matching title.
        for title_div, desc_div in zip(title_divs, desc_divs):
            link = title_div.find('a', class_='postTitle2')
            if link is None:
                continue
            # Strip newlines and spaces from both texts.
            title = link.get_text().replace('\n', '').replace(' ', '')
            text = desc_div.get_text().replace('\n', '').replace(' ', '')
            match = stats_pattern.match(text)
            if match is None:
                # Description does not have the expected layout; skip it.
                continue
            item[title] = {"閱讀量": match.group(1), "評論量": match.group(2)}

統計數據

def statistics():
    """Print the number of collected posts and their summed counts.

    Reads the module-level ``item`` dict; every value must provide the
    keys ``'閱讀量'`` and ``'評論量'`` with contents accepted by ``int()``.
    """
    posts = reads = comments = 0
    for entry in item.values():
        reads += int(entry['閱讀量'])
        comments += int(entry['評論量'])
        posts += 1
    print('總博文量：', posts)
    print('總閱讀量：', reads)
    print('總評論量：', comments)

源碼

from bs4 import BeautifulSoup
import requests
from requests.exceptions import RequestException
import re
import time

def get_one_page(url, headers, timeout=10):
    """Download one page and return its HTML text.

    Args:
        url: Address of the page to fetch.
        headers: HTTP headers forwarded to ``requests.get``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body as text when the status code is 200, otherwise
        ``None`` (non-200 status or a ``RequestException`` during the request).
    """
    try:
        # Without a timeout a stalled connection would block the crawl
        # indefinitely; a timeout is reported as a RequestException subclass.
        response = requests.get(url, headers=headers, timeout=timeout)
    except RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None
def parse_one_page(html):
    """Collect read and comment counts for every post on one list page.

    Results go into the module-level ``item`` dict, keyed by post title,
    as ``{"閱讀量": <digits>, "評論量": <digits>}`` (values stay strings).
    The module-level ``flag`` is set to ``False`` when there is nothing to
    parse, which the caller uses as its stop signal.

    Args:
        html: HTML text of a list page; ``None`` or ``""`` is treated like
            a page without posts.

    Returns:
        ``""`` when the page has no ``div.day`` blocks (or no HTML was
        given), otherwise ``None``.
    """
    global item, flag
    if not html:
        # A failed download yields None; treat it as "no more pages"
        # instead of letting the parser raise on a non-string.
        flag = False
        return ""
    day_blocks = BeautifulSoup(html, 'lxml').find_all('div', class_='day')
    if not day_blocks:
        flag = False
        return ""
    # Raw string: \d must reach the regex engine, not be read as a string escape.
    # NOTE(review): the literals are Traditional Chinese; confirm they match
    # the text actually served by the site.
    stats_pattern = re.compile(r'^.*閱讀.(\d+)..*評論.(\d+)..*編輯$')
    for day in day_blocks:
        title_divs = day.find_all('div', class_='postTitle')
        desc_divs = day.find_all('div', class_='postDesc')
        # Pair the n-th title with the n-th description; zip also protects
        # against a description that has no matching title.
        for title_div, desc_div in zip(title_divs, desc_divs):
            link = title_div.find('a', class_='postTitle2')
            if link is None:
                continue
            # Strip newlines and spaces from both texts.
            title = link.get_text().replace('\n', '').replace(' ', '')
            text = desc_div.get_text().replace('\n', '').replace(' ', '')
            match = stats_pattern.match(text)
            if match is None:
                # Description does not have the expected layout; skip it.
                continue
            item[title] = {"閱讀量": match.group(1), "評論量": match.group(2)}
def statistics():
    """Print the number of collected posts and their summed counts.

    Reads the module-level ``item`` dict; every value must provide the
    keys ``'閱讀量'`` and ``'評論量'`` with contents accepted by ``int()``.
    """
    posts = reads = comments = 0
    for entry in item.values():
        reads += int(entry['閱讀量'])
        comments += int(entry['評論量'])
        posts += 1
    print('總博文量：', posts)
    print('總閱讀量：', reads)
    print('總評論量：', comments)

def kind():
    """Placeholder with no behavior yet; always returns ``None``."""
    return None

def main():
    """Crawl the blog's list pages one by one, then print the totals.

    Pages are requested as ``headurl + <page number>`` starting at 1. The
    loop ends when the module-level ``flag`` becomes false (cleared by
    ``parse_one_page`` on a page without posts) or when a page cannot be
    downloaded. ``statistics`` is called in either case.
    """
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko'}
    headurl = 'https://www.cnblogs.com/-wenli/default.html?page='
    page = 1
    while flag:
        url = headurl + str(page)
        print(url)
        # Fetch the page source.
        html = get_one_page(url, headers)
        if html is None:
            # get_one_page yields None on a request error or non-200
            # status; None is not parseable HTML, so stop here and report
            # whatever was collected so far.
            print('頁面獲取失敗，停止爬取：', url)
            break
        # Parse the page source.
        parse_one_page(html)
        page += 1

    # Print the aggregated totals.
    statistics()


# Crawl state shared by parse_one_page, statistics and main. Defined at
# module level (not only under the guard) so those functions also work when
# this module is imported instead of run as a script.
flag = True  # cleared by parse_one_page when a page has no posts
item = {}    # post title -> {"閱讀量": ..., "評論量": ...}

if __name__ == '__main__':
    main()

演示結果:

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯繫本站郵箱yoyou2525@163.com刪除。

猜您在找 Web Worker——js的多線程，實現統計博客園總閱讀量、總評論量、總推薦量 Python 爬蟲入門——小項目實戰（自動私信博客園某篇博客下的評論人，隨機發送一條笑話，完整代碼在博文最后） python——關於簡單爬取博客園班級成員發的博文的題目、發布人、閱讀、評論，再存到csv文件中博客園博文爬蟲案例效果 .net core 實現簡單爬蟲—抓取博客園的博文列表如何刷博客園閱讀量網絡爬蟲+HtmlAgilityPack+windows服務從博客園爬取20萬博文博客園-博文自動發布工具如何統計博客園的個人博客訪問量如何統計博客園的個人博客訪問量