(一) Topic Selection Background
Today, reading fiction online has found its way into millions of households, and the major web-fiction sites offer a dazzling range of works. To compare the popularity of the individual works of a site's contracted author, I chose the author 無罪 on the Zongheng novel site (縱橫小說網) as the subject of this semester's crawler programming assignment.
(二) Themed Web Crawler Design Scheme
1. Name of the themed web crawler:
Crawling and analysing the works data of a contracted author on the Zongheng novel site
2. Content crawled by the themed web crawler and analysis of its data features
http://home.zongheng.com/show/userInfo/110992.html
Crawl the title and click count of each novel by the author 無罪 on the Zongheng novel site.
3. Overview of the themed web crawler design scheme (including the implementation approach and technical difficulties)
Crawl each book's URL from the source code of the author page, then crawl each book's title and click count from its own page, and finally turn the titles and click counts into a line chart, bar chart and pie chart. The main technical difficulties are locating the right tags when parsing the pages and getting Chinese labels to display correctly in matplotlib.
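A minimal sketch of this flow, using the function names defined later in section (四); the helper run() below is hypothetical and only illustrates how the pieces fit together, with error handling omitted:

def run(author_url):
    # 1. collect every book URL from the author's page
    # 2. fetch each book page and pull out its title and click count
    titles, clicks = [], []
    for book_url in urlinfo(author_url):
        page = gethtml(book_url)
        titles.append(namesinfo(page)[0])
        clicks.append(int(numsinfo(page)[0]))
    # 3. the two lists then feed the Excel export and the charts
    return titles, clicks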
(三) Structural Feature Analysis of the Subject Pages
1. Structure and feature analysis of the subject pages

2. HTML page parsing
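The screenshots from the original page analysis are not reproduced here. Judging from the crawler code (not re-verified against the live site), the author page lists each book in a <p class="tit"> element whose <a> tag carries the book's URL, while each book page exposes the title in a <div class="book-name"> and the click count in the third <i> element inside a <div class="nums">. A small parsing sketch over a made-up HTML fragment with that assumed structure:

from bs4 import BeautifulSoup

# Hypothetical fragment mirroring the structure the crawler assumes;
# the real pages on zongheng.com may differ.
sample = '''
<div class="book-name">劍王朝</div>
<div class="nums">
    <span>字數 <i>1523647</i></span>
    <span>推薦 <i>56789</i></span>
    <span>點擊 <i>2325362.45</i></span>
</div>
'''
soup = BeautifulSoup(sample, "html.parser")
print(soup.find("div", class_="book-name").string)              # book title
print(soup.find("div", class_="nums").find_all("i")[2].string)  # click count, e.g. 2325362.45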

(四) Web Crawler Program Design
Data crawling, collection and cleaning
# URL of the site to crawl
url = 'http://www.zongheng.com/'

# Fetch the page content
def gethtml(url):
    # e.g. the author page 'http://home.zongheng.com/show/userInfo/110992.html'
    info = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36'}
    try:
        data = requests.get(url, headers=info)
        data.raise_for_status()
        data.encoding = data.apparent_encoding
        return data.text
    except Exception:
        return " "

# Book URLs
def urlinfo(url):
    books = []
    book = gethtml(url)
    soup = BeautifulSoup(book, "html.parser")
    # Get the p tags whose class attribute is "tit"
    p = soup.find_all("p", attrs="tit")
    for item in p:
        # Get the book's address
        books.append(item.a.attrs['href'])
    return books

# Click-count information
def numsinfo(html):
    n = []
    soup = BeautifulSoup(html, 'html.parser')
    div = soup.find_all("div", attrs='nums')
    nums = div[0]
    i = 0
    for spa in nums.find_all("i"):
        if i == 2:
            # The third <i> element holds the click count; drop the decimal part
            n.append(spa.string.split('.')[0])
            break
        i += 1
    return n

# Book-title information
def namesinfo(html):
    soup = BeautifulSoup(html, 'html.parser')
    # Get the div whose class attribute is "book-name"
    name = soup.find_all("div", attrs='book-name')
    # Extract the Chinese title with a regular expression
    namess = re.findall(r"[\u4e00-\u9fa5]+", str(name[0]))
    return namess
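A quick check of these three functions against the author page; this is purely illustrative, and the output depends on the live site's current structure and data:

author = 'http://home.zongheng.com/show/userInfo/110992.html'
book_urls = urlinfo(author)             # URLs of the author's book pages
page = gethtml(book_urls[0])            # HTML of the first book page
print(namesinfo(page), numsinfo(page))  # e.g. ['劍王朝'] ['2325362']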
Text analysis

# Save the titles and click counts to an Excel file
def file(book, nums, address):
    # Create the Excel workbook
    excel = xlwt.Workbook(encoding='utf-8')
    # Create a sheet
    sheet1 = excel.add_sheet(u'One', cell_overwrite_ok=True)
    # Write the column headers
    sheet1.write(0, 0, 'book')
    sheet1.write(0, 1, 'number')
    # Write the data rows (row 0 holds the headers)
    for i in range(len(book)):
        sheet1.write(i + 1, 0, book[i])
    for j in range(len(nums)):
        sheet1.write(j + 1, 1, nums[j])
    excel.save(address)
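For example, the following call would write a two-column workbook to the current directory; the file name and sample values are chosen here purely for illustration:

file(['劍王朝', '仙魔變'], [2325362, 2807917], 'demo.xls')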
Data analysis and visualization

# Bar chart
from matplotlib import pyplot as plt
# Use the SimHei font so that Chinese labels render correctly
plt.rcParams['font.sans-serif'] = ['SimHei']
x = ['渡劫之王', '巴山劍揚', '平天策', '神仙職員', '劍王朝', '流氓高手', '仙俠世界', '仙魔變',
     '羅浮', '冰火破壞神', '眾神王座']
y = [3255112, 400640, 3062812, 913820, 2325362, 1113306, 2723772, 2807917, 2436869, 2224430, 1007224]
plt.bar(x, y)
plt.title('縱橫小說網無罪')
plt.xlabel('作品')
plt.ylabel('點擊量')
plt.xticks(rotation=45)
plt.show()
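The design scheme in section (二) also calls for a pie chart; a minimal sketch reusing the same x and y lists as above:

# Pie chart: each book's share of the author's total clicks
plt.figure(figsize=(8, 8))
plt.pie(y, labels=x, autopct='%1.1f%%')
plt.title('縱橫小說網無罪')
plt.show()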
Bringing together the code from all the parts above, the complete program is as follows:
# Import the required libraries
from bs4 import BeautifulSoup
import requests
import matplotlib
import re
import xlwt
import matplotlib.pyplot as plt

# URL of the site to crawl
url = 'http://www.zongheng.com/'

# Fetch the page content
def gethtml(url):
    # e.g. the author page 'http://home.zongheng.com/show/userInfo/110992.html'
    info = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36'}
    try:
        data = requests.get(url, headers=info)
        data.raise_for_status()
        data.encoding = data.apparent_encoding
        return data.text
    except Exception:
        return " "

# Book URLs
def urlinfo(url):
    books = []
    book = gethtml(url)
    soup = BeautifulSoup(book, "html.parser")
    # Get the p tags whose class attribute is "tit"
    p = soup.find_all("p", attrs="tit")
    for item in p:
        # Get the book's address
        books.append(item.a.attrs['href'])
    return books

# Click-count information
def numsinfo(html):
    n = []
    soup = BeautifulSoup(html, 'html.parser')
    div = soup.find_all("div", attrs='nums')
    nums = div[0]
    i = 0
    for spa in nums.find_all("i"):
        if i == 2:
            # The third <i> element holds the click count; drop the decimal part
            n.append(spa.string.split('.')[0])
            break
        i += 1
    return n

# Book-title information
def namesinfo(html):
    soup = BeautifulSoup(html, 'html.parser')
    # Get the div whose class attribute is "book-name"
    name = soup.find_all("div", attrs='book-name')
    # Extract the Chinese title with a regular expression
    namess = re.findall(r"[\u4e00-\u9fa5]+", str(name[0]))
    return namess

# Fix Chinese characters rendering as boxes in matplotlib
matplotlib.rcParams['font.sans-serif'] = ['SimHei']
matplotlib.rcParams['font.family'] = 'sans-serif'
matplotlib.rcParams['axes.unicode_minus'] = False

# Bar chart
def Bar(x, y, user):
    plt.bar(x, y)
    plt.title('縱橫小說網' + user)
    plt.xlabel('作品')
    plt.ylabel('點擊量')
    plt.xticks(rotation=45)
    plt.show()

# Save the titles and click counts to an Excel file
def file(book, nums, address):
    # Create the Excel workbook
    excel = xlwt.Workbook(encoding='utf-8')
    # Create a sheet
    sheet1 = excel.add_sheet(u'One', cell_overwrite_ok=True)
    # Write the column headers
    sheet1.write(0, 0, 'book')
    sheet1.write(0, 1, 'number')
    # Write the data rows (row 0 holds the headers)
    for i in range(len(book)):
        sheet1.write(i + 1, 0, book[i])
    for j in range(len(nums)):
        sheet1.write(j + 1, 1, nums[j])
    excel.save(address)

# Convert a list of single-element lists into a flat list
def convert(lista):
    listb = []
    for i in lista:
        listb.append(i[0])
    return listb

def main():
    # Author page
    author = 'http://home.zongheng.com/show/userInfo/110992.html'
    user = '無罪'
    urls = urlinfo(author)
    namelist = []
    countlist = []
    for url in urls:
        html = gethtml(url)
        namelist.append(namesinfo(html))
        countlist.append(numsinfo(html))
    namelist = convert(namelist)
    countlist = convert(countlist)
    for i in range(len(countlist)):
        countlist[i] = int(countlist[i])
    # Save path for the Excel file (named after the author)
    addr = f'D:\\{user}.xls'
    file(namelist, countlist, addr)
    Bar(namelist, countlist, user)

if __name__ == '__main__':
    main()
(五) Summary
Through this hands-on exercise in Python web crawling I came to recognise the gaps in my own learning. Along the way I consulted and drew on a great deal of material from older students, and, starting from scratch, I slowly worked my way through to completing this course project.
I spent considerable effort learning to draw charts with matplotlib, and parsing the site's pages was also quite tricky, but the project made me deeply aware of how powerful Python is as a tool and sparked my interest in programming languages. I hope to keep making progress in my future studies and to improve my web-crawling skills.
