如何使用
只需要將代碼中的 headurl 替換為以下格式的鏈接。其中你只需要改變鏈接中的陰影部分(原文以陰影標出,應為下例中的用戶名 -wenli),將其換成你自己的博客園鏈接中對應的部分。
類似: https://www.cnblogs.com/-wenli/default.html?page=
原理
使用 requests 爬取網頁,再使用 BeautifulSoup 解析網頁、獲取數據,並對數據做預處理,最後使用正則表達式匹配出需要的數據。
最後的數據使用一個大字典存儲。
爬取網頁
爬取網頁這裡做了異常處理。
def get_one_page(url, headers, timeout=10):
    """Download one page and return its body as text.

    Args:
        url: Address of the page to fetch.
        headers: HTTP headers forwarded to ``requests.get``.
        timeout: Seconds to wait for the server. Without a timeout
            ``requests`` can block indefinitely on an unresponsive host.

    Returns:
        The response text when the status code is 200; ``None`` when the
        status is anything else or a ``RequestException`` is raised.
    """
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except RequestException:
        return None
    if response.status_code == 200:
        return response.text
    # Make the non-200 result explicit instead of falling off the end.
    return None
解析網頁
def parse_one_page(html):
    """Collect read and comment counts for every post on one listing page.

    Results are written into the module-level ``item`` dict, keyed by the
    post title (whitespace removed); each value is a dict with the keys
    ``"閱讀量"`` and ``"評論量"`` holding the matched digit strings.

    Args:
        html: Page source as returned by ``get_one_page``; may be ``None``
            when the download failed.

    Returns:
        ``""`` when paging should stop (no page source, or the page has no
        ``div.day`` elements); in that case the module-level ``flag`` is set
        to ``False``. Otherwise ``None``.
    """
    global item, flag

    # A failed download yields None; stop paging instead of crashing inside
    # BeautifulSoup.
    if html is None:
        flag = False
        return ""

    soup = BeautifulSoup(html, 'lxml')
    day_blocks = soup.find_all('div', class_='day')
    if not day_blocks:
        flag = False
        return ""

    # NOTE(review): the literal labels in this pattern must be the exact
    # characters the site renders in the post footer -- confirm.
    stats_pattern = re.compile(r'^.*閱讀.(\d+)..*評論.(\d+)..*編輯$')

    for day in day_blocks:
        title_divs = day.find_all('div', class_='postTitle')
        desc_divs = day.find_all('div', class_='postDesc')
        # Pair each title with the footer at the same position; zip stops at
        # the shorter list, so a count mismatch cannot index out of range.
        for title_div, desc_div in zip(title_divs, desc_divs):
            links = title_div.find_all('a', class_='postTitle2')
            if not links:
                continue
            title = links[0].get_text().replace('\n', '').replace(' ', '')

            info = desc_div.get_text().replace('\n', '').replace(' ', '')
            match = stats_pattern.match(info)
            if match is None:
                # Footer does not have the expected layout; skip this post.
                continue
            item[title] = {
                "閱讀量": match.group(1),
                "評論量": match.group(2),
            }
統計數據
def statistics():
    """Print the number of posts and the summed read and comment counts.

    Reads the module-level ``item`` dict, whose values are dicts holding the
    counts as digit strings under ``'閱讀量'`` and ``'評論量'``.
    """
    total_reads = 0
    total_comments = 0
    for counts in item.values():
        total_reads += int(counts['閱讀量'])
        total_comments += int(counts['評論量'])
    post_count = len(item)

    print('總博文量:', post_count)
    print('總閱讀量:', total_reads)
    print('總評論量:', total_comments)
源碼
import re
import time

import requests
from bs4 import BeautifulSoup
from requests.exceptions import RequestException

# Shared state. ``flag`` stays True while there are more listing pages to
# fetch; ``item`` maps post title -> {"閱讀量": str, "評論量": str}.
# Defined at module level so the functions also work when this file is
# imported rather than run as a script.
flag = True
item = {}


def get_one_page(url, headers, timeout=10):
    """Download one page and return its body as text.

    Args:
        url: Address of the page to fetch.
        headers: HTTP headers forwarded to ``requests.get``.
        timeout: Seconds to wait for the server. Without a timeout
            ``requests`` can block indefinitely on an unresponsive host.

    Returns:
        The response text when the status code is 200; ``None`` when the
        status is anything else or a ``RequestException`` is raised.
    """
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None


def parse_one_page(html):
    """Collect read and comment counts for every post on one listing page.

    Results are written into the module-level ``item`` dict, keyed by the
    post title (whitespace removed).

    Args:
        html: Page source as returned by ``get_one_page``; may be ``None``
            when the download failed.

    Returns:
        ``""`` when paging should stop (no page source, or the page has no
        ``div.day`` elements); in that case ``flag`` is set to ``False``.
        Otherwise ``None``.
    """
    global item, flag

    # A failed download yields None; stop paging instead of crashing inside
    # BeautifulSoup.
    if html is None:
        flag = False
        return ""

    soup = BeautifulSoup(html, 'lxml')
    day_blocks = soup.find_all('div', class_='day')
    if not day_blocks:
        flag = False
        return ""

    # NOTE(review): the literal labels in this pattern must be the exact
    # characters the site renders in the post footer -- confirm.
    stats_pattern = re.compile(r'^.*閱讀.(\d+)..*評論.(\d+)..*編輯$')

    for day in day_blocks:
        title_divs = day.find_all('div', class_='postTitle')
        desc_divs = day.find_all('div', class_='postDesc')
        # Pair each title with the footer at the same position; zip stops at
        # the shorter list, so a count mismatch cannot index out of range.
        for title_div, desc_div in zip(title_divs, desc_divs):
            links = title_div.find_all('a', class_='postTitle2')
            if not links:
                continue
            title = links[0].get_text().replace('\n', '').replace(' ', '')

            info = desc_div.get_text().replace('\n', '').replace(' ', '')
            match = stats_pattern.match(info)
            if match is None:
                # Footer does not have the expected layout; skip this post.
                continue
            item[title] = {
                "閱讀量": match.group(1),
                "評論量": match.group(2),
            }


def statistics():
    """Print the number of posts and the summed read and comment counts."""
    readtotal = 0
    commenttotal = 0
    for counts in item.values():
        readtotal += int(counts['閱讀量'])
        commenttotal += int(counts['評論量'])
    print('總博文量:', len(item))
    print('總閱讀量:', readtotal)
    print('總評論量:', commenttotal)


def kind():
    """Placeholder; not implemented."""
    pass


def main():
    """Walk the listing pages one by one, then print the totals.

    Paging continues until ``parse_one_page`` clears ``flag``, which happens
    on the first page that cannot be fetched or contains no posts.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko'}
    headurl = 'https://www.cnblogs.com/-wenli/default.html?page='
    page = 1
    while flag:
        url = headurl + str(page)
        print(url)
        # Fetch the page source.
        html = get_one_page(url, headers)
        # Parse it and accumulate results into ``item``.
        parse_one_page(html)
        page += 1
    # Report the totals.
    statistics()


if __name__ == '__main__':
    main()
演示結果: