(一) Topic Selection Background
Today, reading fiction online has found its way into millions of households, and the major web-fiction sites offer a dazzling range of works. To compare the popularity of the individual works of a site's contracted author, I chose the author 無罪 on the Zongheng novel site (縱橫小說網) as the subject of this semester's crawler programming assignment.
(二) Themed Web Crawler Design Scheme
1. Name of the themed web crawler:
Crawling and analysing the works data of a contracted author on the Zongheng novel site
2. Content crawled by the themed web crawler and analysis of its data features
http://home.zongheng.com/show/userInfo/110992.html
Crawl the title and click count of each novel by the author 無罪 on the Zongheng novel site.
3. Overview of the themed web crawler design scheme (including the implementation approach and technical difficulties)
Crawl each book's URL from the source code of the author page, then crawl each book's title and click count from its own page, and finally turn the titles and click counts into a line chart, bar chart and pie chart. The main technical difficulties are locating the right tags when parsing the pages and getting Chinese labels to display correctly in matplotlib.
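A minimal sketch of this flow, using the function names defined later in section (四); the helper run() below is hypothetical and only illustrates how the pieces fit together, with error handling omitted:

def run(author_url):
    # 1. collect every book URL from the author's page
    # 2. fetch each book page and pull out its title and click count
    titles, clicks = [], []
    for book_url in urlinfo(author_url):
        page = gethtml(book_url)
        titles.append(namesinfo(page)[0])
        clicks.append(int(numsinfo(page)[0]))
    # 3. the two lists then feed the Excel export and the charts
    return titles, clicks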
(三) Structural Feature Analysis of the Subject Pages
1. Structure and feature analysis of the subject pages

2. HTML page parsing
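The screenshots from the original page analysis are not reproduced here. Judging from the crawler code (not re-verified against the live site), the author page lists each book in a <p class="tit"> element whose <a> tag carries the book's URL, while each book page exposes the title in a <div class="book-name"> and the click count in the third <i> element inside a <div class="nums">. A small parsing sketch over a made-up HTML fragment with that assumed structure:

from bs4 import BeautifulSoup

# Hypothetical fragment mirroring the structure the crawler assumes;
# the real pages on zongheng.com may differ.
sample = '''
<div class="book-name">劍王朝</div>
<div class="nums">
    <span>字數 <i>1523647</i></span>
    <span>推薦 <i>56789</i></span>
    <span>點擊 <i>2325362.45</i></span>
</div>
'''
soup = BeautifulSoup(sample, "html.parser")
print(soup.find("div", class_="book-name").string)              # book title
print(soup.find("div", class_="nums").find_all("i")[2].string)  # click count, e.g. 2325362.45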

(四) Web Crawler Program Design
Data crawling, collection and cleaning
# URL of the site to crawl
url = 'http://www.zongheng.com/'

# Fetch the page content
def gethtml(url):
    # e.g. the author page 'http://home.zongheng.com/show/userInfo/110992.html'
    info = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36'}
    try:
        data = requests.get(url, headers=info)
        data.raise_for_status()
        data.encoding = data.apparent_encoding
        return data.text
    except Exception:
        return " "

# Book URLs
def urlinfo(url):
    books = []
    book = gethtml(url)
    soup = BeautifulSoup(book, "html.parser")
    # Get the p tags whose class attribute is "tit"
    p = soup.find_all("p", attrs="tit")
    for item in p:
        # Get the book's address
        books.append(item.a.attrs['href'])
    return books

# Click-count information
def numsinfo(html):
    n = []
    soup = BeautifulSoup(html, 'html.parser')
    div = soup.find_all("div", attrs='nums')
    nums = div[0]
    i = 0
    for spa in nums.find_all("i"):
        if i == 2:
            # The third <i> element holds the click count; drop the decimal part
            n.append(spa.string.split('.')[0])
            break
        i += 1
    return n

# Book-title information
def namesinfo(html):
    soup = BeautifulSoup(html, 'html.parser')
    # Get the div whose class attribute is "book-name"
    name = soup.find_all("div", attrs='book-name')
    # Extract the Chinese title with a regular expression
    namess = re.findall(r"[\u4e00-\u9fa5]+", str(name[0]))
    return namess
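A quick check of these three functions against the author page; this is purely illustrative, and the output depends on the live site's current structure and data:

author = 'http://home.zongheng.com/show/userInfo/110992.html'
book_urls = urlinfo(author)             # URLs of the author's book pages
page = gethtml(book_urls[0])            # HTML of the first book page
print(namesinfo(page), numsinfo(page))  # e.g. ['劍王朝'] ['2325362']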
Text analysis

# Save the titles and click counts to an Excel file
def file(book, nums, address):
    # Create the Excel workbook
    excel = xlwt.Workbook(encoding='utf-8')
    # Create a sheet
    sheet1 = excel.add_sheet(u'One', cell_overwrite_ok=True)
    # Write the column headers
    sheet1.write(0, 0, 'book')
    sheet1.write(0, 1, 'number')
    # Write the data rows (row 0 holds the headers)
    for i in range(len(book)):
        sheet1.write(i + 1, 0, book[i])
    for j in range(len(nums)):
        sheet1.write(j + 1, 1, nums[j])
    excel.save(address)
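For example, the following call would write a two-column workbook to the current directory; the file name and sample values are chosen here purely for illustration:

file(['劍王朝', '仙魔變'], [2325362, 2807917], 'demo.xls')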
Data analysis and visualization

# Bar chart
from matplotlib import pyplot as plt
# Use the SimHei font so that Chinese labels render correctly
plt.rcParams['font.sans-serif'] = ['SimHei']
x = ['渡劫之王', '巴山劍揚', '平天策', '神仙職員', '劍王朝', '流氓高手', '仙俠世界', '仙魔變',
     '羅浮', '冰火破壞神', '眾神王座']
y = [3255112, 400640, 3062812, 913820, 2325362, 1113306, 2723772, 2807917, 2436869, 2224430, 1007224]
plt.bar(x, y)
plt.title('縱橫小說網無罪')
plt.xlabel('作品')
plt.ylabel('點擊量')
plt.xticks(rotation=45)
plt.show()
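The design scheme in section (二) also calls for a pie chart; a minimal sketch reusing the same x and y lists as above:

# Pie chart: each book's share of the author's total clicks
plt.figure(figsize=(8, 8))
plt.pie(y, labels=x, autopct='%1.1f%%')
plt.title('縱橫小說網無罪')
plt.show()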
Bringing together the code from all the parts above, the complete program is as follows:
# Import the required libraries
from bs4 import BeautifulSoup
import requests
import matplotlib
import re
import xlwt
import matplotlib.pyplot as plt

# URL of the site to crawl
url = 'http://www.zongheng.com/'

# Fetch the page content
def gethtml(url):
    # e.g. the author page 'http://home.zongheng.com/show/userInfo/110992.html'
    info = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36'}
    try:
        data = requests.get(url, headers=info)
        data.raise_for_status()
        data.encoding = data.apparent_encoding
        return data.text
    except Exception:
        return " "

# Book URLs
def urlinfo(url):
    books = []
    book = gethtml(url)
    soup = BeautifulSoup(book, "html.parser")
    # Get the p tags whose class attribute is "tit"
    p = soup.find_all("p", attrs="tit")
    for item in p:
        # Get the book's address
        books.append(item.a.attrs['href'])
    return books

# Click-count information
def numsinfo(html):
    n = []
    soup = BeautifulSoup(html, 'html.parser')
    div = soup.find_all("div", attrs='nums')
    nums = div[0]
    i = 0
    for spa in nums.find_all("i"):
        if i == 2:
            # The third <i> element holds the click count; drop the decimal part
            n.append(spa.string.split('.')[0])
            break
        i += 1
    return n

# Book-title information
def namesinfo(html):
    soup = BeautifulSoup(html, 'html.parser')
    # Get the div whose class attribute is "book-name"
    name = soup.find_all("div", attrs='book-name')
    # Extract the Chinese title with a regular expression
    namess = re.findall(r"[\u4e00-\u9fa5]+", str(name[0]))
    return namess

# Fix Chinese characters rendering as boxes in matplotlib
matplotlib.rcParams['font.sans-serif'] = ['SimHei']
matplotlib.rcParams['font.family'] = 'sans-serif'
matplotlib.rcParams['axes.unicode_minus'] = False

# Bar chart
def Bar(x, y, user):
    plt.bar(x, y)
    plt.title('縱橫小說網' + user)
    plt.xlabel('作品')
    plt.ylabel('點擊量')
    plt.xticks(rotation=45)
    plt.show()

# Save the titles and click counts to an Excel file
def file(book, nums, address):
    # Create the Excel workbook
    excel = xlwt.Workbook(encoding='utf-8')
    # Create a sheet
    sheet1 = excel.add_sheet(u'One', cell_overwrite_ok=True)
    # Write the column headers
    sheet1.write(0, 0, 'book')
    sheet1.write(0, 1, 'number')
    # Write the data rows (row 0 holds the headers)
    for i in range(len(book)):
        sheet1.write(i + 1, 0, book[i])
    for j in range(len(nums)):
        sheet1.write(j + 1, 1, nums[j])
    excel.save(address)

# Convert a list of single-element lists into a flat list
def convert(lista):
    listb = []
    for i in lista:
        listb.append(i[0])
    return listb

def main():
    # Author page
    author = 'http://home.zongheng.com/show/userInfo/110992.html'
    user = '無罪'
    urls = urlinfo(author)
    namelist = []
    countlist = []
    for url in urls:
        html = gethtml(url)
        namelist.append(namesinfo(html))
        countlist.append(numsinfo(html))
    namelist = convert(namelist)
    countlist = convert(countlist)
    for i in range(len(countlist)):
        countlist[i] = int(countlist[i])
    # Save path for the Excel file (named after the author)
    addr = f'D:\\{user}.xls'
    file(namelist, countlist, addr)
    Bar(namelist, countlist, user)

if __name__ == '__main__':
    main()
(五) Summary
Through this hands-on exercise in Python web crawling I came to recognise the gaps in my own learning. Along the way I consulted and drew on a great deal of material from older students, and, starting from scratch, I slowly worked my way through to completing this course project.
I spent considerable effort learning to draw charts with matplotlib, and parsing the site's pages was also quite tricky, but the project made me deeply aware of how powerful Python is as a tool and sparked my interest in programming languages. I hope to keep making progress in my future studies and to improve my web-crawling skills.
