一、選題背景
小說這類文化產物可以說從小陪伴我們長大,還記得高中時晚上在宿舍看小說,第二天上課直打哈欠。如何了解一本新出小說的熱度呢?可以查看小說平台的排行數據,從中看出小說的熱度。為此我選擇此題,對小說的周熱度進行分析,並根據目前的更新字數總量判斷小說更新了多少。
二、網絡爬蟲設計方案
名稱:飛盧小說周閱讀熱度數據爬取
內容:通過爬蟲的三段式進行數據爬取,最後通過文件寫入將數據保存為CSV文件。
思路:首先用requests請求網頁,用etree進行網頁解析,然後使用etree.xpath進行數據篩選,用for循環進行網頁的翻頁,最後通過文件寫入進行數據的保存。
難點:網站的翻頁設置,以及數據的篩選。
三、結構特征分析
結構:內容導航型
HTML頁面解析:
小說名:
分類:
周點擊、字數量:
簡介:
節點查找、遍歷:
查找:通過xpath找到標簽位置。
遍歷:使用for循環+計數方案進行遍歷出數據。
四、網絡爬蟲程序設計
數據爬取與采集:
import csv
import os
import random
import re
import sys
import time

import requests
from bs4 import BeautifulSoup
from lxml import etree
from tqdm import tqdm


# Pool of desktop browser User-Agent strings. Each entry must be separated by
# a comma; without commas Python concatenates adjacent literals into a single
# giant string and random.choice() has nothing to choose from.
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.16 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1623.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.17 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.2 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1468.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1467.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1464.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1500.55 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; rv:27.3) Gecko/20130101 Firefox/27.3',
    'Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:27.0) Gecko/20121011 Firefox/27.0',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/25.0',
    'Mozilla/5.0 (Windows NT 6.0; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0',
    'Mozilla/5.0 (Windows NT 6.2; rv:22.0) Gecko/20130405 Firefox/23.0',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20130406 Firefox/23.0',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:23.0) Gecko/20131011 Firefox/23.0',
    'Mozilla/5.0 (Windows NT 6.2; rv:22.0) Gecko/20130405 Firefox/22.0',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:22.0) Gecko/20130328 Firefox/22.0',
    'Mozilla/5.0 (Windows NT 6.1; rv:22.0) Gecko/20130405 Firefox/22.0',
    'Mozilla/5.0 (Microsoft Windows NT 6.2.9200.0); rv:22.0) Gecko/20130405 Firefox/22.0',
    'Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:16.0.1) Gecko/20121011 Firefox/21.0.1',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:16.0.1) Gecko/20121011 Firefox/21.0.1',
    'Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:21.0.0) Gecko/20121011 Firefox/21.0.0',
    'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:21.0) Gecko/20130514 Firefox/21.0',
]

# Base request headers; the User-Agent is re-drawn for every request in Fl().
headers = {
    'User-Agent': random.choice(USER_AGENTS),
    'Connection': 'keep-alive',
    'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
}

OUTPUT_FILE = "Fl_pop.csv"
CSV_COLUMNS = ["book_name", "book_class", "book_hits", "book_word", "book_info"]

# NOTE(review): the XPath addresses a book as div[row]/div[column] and the
# original comment says a page holds 30 books, so the listing is assumed to be
# 15 rows x 2 columns -- confirm against the live page.
ROWS_PER_PAGE = 15
COLUMNS_PER_ROW = 2
CARD_XPATH = "//*[@id='BookContent']/div[{}]/div[{}]/div[2]"


def _last_text(html, xpath):
    """Return the last text node matched by *xpath*, or None when nothing matches."""
    nodes = html.xpath(xpath)
    return nodes[-1] if nodes else None


def _parse_book(html, row, column):
    """Extract one book as a list of CSV fields, or None if any field is missing."""
    card = CARD_XPATH.format(row, column)
    book_name = _last_text(html, card + "/div[1]/div[1]/h1/a/text()")
    book_class = _last_text(html, card + "/div[2]/span/a/text()")
    book_hits = _last_text(html, card + "/div[2]/span/span[2]/text()")
    book_word = _last_text(html, card + "/div[2]/span/span[4]/text()")
    book_info = _last_text(html, card + "/div[3]/a/text()")

    if None in (book_name, book_class, book_hits, book_word, book_info):
        return None

    # Remove the label prefix and the unit suffix so only the figure remains.
    book_hits = book_hits.strip('周點擊:').strip('萬')
    book_word = book_word.strip('字數:').strip('萬')
    return [book_name, book_class, book_hits, book_word, book_info]


def Fl(page):
    """Crawl the first *page* listing pages and append the books to Fl_pop.csv.

    Each row holds book_name, book_class, book_hits, book_word and book_info.
    The header is written only when the file is new or empty, and fields are
    written through the csv module so commas inside a synopsis do not break
    the row. A page that cannot be fetched is reported on stderr and skipped.

    Args:
        page: number of listing pages to crawl, starting from page 1.
    """
    needs_header = (
        not os.path.exists(OUTPUT_FILE) or os.path.getsize(OUTPUT_FILE) == 0
    )
    with open(OUTPUT_FILE, "a", encoding="utf-8", newline="") as csv_file:
        writer = csv.writer(csv_file)
        if needs_header:
            writer.writerow(CSV_COLUMNS)

        for page_no in range(1, page + 1):
            url = 'https://b.faloo.com/l_0_0_0_0_0_1_' + str(page_no) + '.html'
            request_headers = dict(headers)
            request_headers['User-Agent'] = random.choice(USER_AGENTS)
            try:
                res = requests.get(url, headers=request_headers, timeout=10)
                res.raise_for_status()
            except requests.RequestException as exc:
                print('failed to fetch', url, ':', exc, file=sys.stderr)
                continue

            res.encoding = 'gb2312'
            html = etree.HTML(res.text)
            if html is None:
                continue

            for row in range(1, ROWS_PER_PAGE + 1):
                for column in range(1, COLUMNS_PER_ROW + 1):
                    book = _parse_book(html, row, column)
                    if book is None:
                        # Missing cell: skip it instead of stalling the page.
                        continue
                    writer.writerow(book)
                    book_name, book_class, book_hits, book_word, book_info = book
                    print(book_name, '\n', book_class, '\n', '周點擊數:', book_hits, '萬', '\n', '字數:', book_word, '萬', '\n', '簡介:\n', book_info, '\n')


if __name__ == '__main__':
    # Number of listing pages to crawl.
    page = 20
    Fl(page)
運行截圖:
數據清洗和處理:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Load the crawler output. Malformed rows are skipped; on_bad_lines replaces
# error_bad_lines, which was removed in pandas 2.0.
Fl = pd.read_csv(r'C:\Users\LP\Desktop\LP\Fl_pop.csv', on_bad_lines='skip')
Fl.head(20)

# Remove duplicate rows.
Fl = Fl.drop_duplicates()
# The crawler appends a header line on every run, so the numeric columns may
# contain stray header text; coerce it to NaN so the next step removes it.
for column in ('book_hits', 'book_word'):
    Fl[column] = pd.to_numeric(Fl[column], errors='coerce')
# Remove rows with missing values.
Fl = Fl.dropna(axis=0)
# Drop the synopsis column, which is not used in the charts.
Fl = Fl.drop(columns=['book_info'])

# Visual analysis of the first 20 books; hits and word counts are in units of 萬.
x = Fl['book_name'].head(20)
y = Fl['book_hits'].head(20)
z = Fl['book_word'].head(20)
plt.rcParams['font.sans-serif'] = ['SimHei']  # font able to render Chinese labels
plt.rcParams['axes.unicode_minus'] = False  # render the minus sign correctly

# Line chart: weekly hits.
plt.plot(x, y, '-.', color='c', label="點擊量 單位/萬")
plt.xticks(rotation=90)
plt.legend(loc="best")
plt.title("飛盧小說周點擊量趨勢圖")
plt.xlabel("書名")
plt.ylabel("點擊數")
plt.show()

# Line chart: word count.
plt.plot(x, z, '-.', color='r', label="字數 單位/萬")
plt.xticks(rotation=90)
plt.legend(loc="best")
plt.xlabel("書名")
plt.ylabel("字數")
plt.title("飛盧小說字數趨勢圖")
plt.show()

# Bar chart: weekly hits.
plt.bar(x, y, alpha=0.2, width=0.4, color='w', edgecolor='red', lw=3)
plt.title("飛盧小說周點擊量柱狀圖")
plt.xticks(rotation=90)
plt.xlabel("小說名")
plt.ylabel("點擊量")
plt.show()

# Bar chart: word count.
plt.bar(x, z, alpha=0.2, width=0.4, color='g', edgecolor='red', lw=3)
plt.title("飛盧小說字數柱狀圖")
plt.xticks(rotation=90)
plt.xlabel("小說名")
plt.ylabel("字數")
plt.show()

# Horizontal bar chart: weekly hits.
plt.barh(x, y, alpha=0.2, height=0.4, color='b', edgecolor='gray', label="點擊量 單位/萬", lw=3)
plt.title("飛盧小說周點擊量水平圖")
plt.legend(loc="best")
plt.xlabel("點擊量")
plt.ylabel("小說名")
plt.show()

# Horizontal bar chart: word count.
plt.barh(x, z, alpha=0.2, height=0.4, color='g', edgecolor='gray', label="字數 單位/萬", lw=3)
plt.title("飛盧小說字數水平圖")
plt.legend(loc="best")
plt.xlabel("字數")
plt.ylabel("小說名")
plt.show()

# Scatter plot: weekly hits.
plt.scatter(x, y, color='w', marker='o', s=40, edgecolor='black', alpha=0.5)
plt.xticks(rotation=90)
plt.title("飛盧小說周點擊量散點圖")
plt.xlabel("小說名")
plt.ylabel("點擊量")
plt.show()

# Box plot: weekly hits (vertical, with the mean marked).
plt.boxplot(y, vert=True, showmeans=True)
plt.title("飛盧小說周點擊量盒圖")
plt.show()
詞雲:
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import wordcloud as wc

# Load the crawler output. Malformed rows are skipped; on_bad_lines replaces
# error_bad_lines, which was removed in pandas 2.0.
Fl = pd.read_csv(r'C:\Users\10950\Desktop\LP\Fl_pop.csv', on_bad_lines='skip')
word_cloud = wc.WordCloud(
    width=500,  # canvas width
    height=500,  # canvas height
    background_color='white',  # canvas background colour
    font_path='msyhbd.ttc',  # must be a font installed locally that covers Chinese
    max_font_size=400,  # upper bound on the font size of the largest word
    random_state=50,  # fixed seed so the generated image is reproducible
)
# Build one space-separated string of categories. Missing values are dropped
# first because str.join raises TypeError on NaN (a float).
text = " ".join(Fl['book_class'].dropna().astype(str))

word_cloud.generate(text)
plt.imshow(word_cloud)
plt.show()
總代碼:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Load the crawler output. Malformed rows are skipped; on_bad_lines replaces
# error_bad_lines, which was removed in pandas 2.0.
Fl = pd.read_csv(r'C:\Users\LP\Desktop\LP\Fl_pop.csv', on_bad_lines='skip')
Fl.head(20)

# Remove duplicate rows.
Fl = Fl.drop_duplicates()
# The crawler appends a header line on every run, so the numeric columns may
# contain stray header text; coerce it to NaN so the next step removes it.
for column in ('book_hits', 'book_word'):
    Fl[column] = pd.to_numeric(Fl[column], errors='coerce')
# Remove rows with missing values.
Fl = Fl.dropna(axis=0)
# Drop the synopsis column, which is not used in the charts.
Fl = Fl.drop(columns=['book_info'])

# Visual analysis of the first 20 books; hits and word counts are in units of 萬.
x = Fl['book_name'].head(20)
y = Fl['book_hits'].head(20)
z = Fl['book_word'].head(20)
plt.rcParams['font.sans-serif'] = ['SimHei']  # font able to render Chinese labels
plt.rcParams['axes.unicode_minus'] = False  # render the minus sign correctly

# Line chart: weekly hits.
plt.plot(x, y, '-.', color='c', label="點擊量 單位/萬")
plt.xticks(rotation=90)
plt.legend(loc="best")
plt.title("飛盧小說周點擊量趨勢圖")
plt.xlabel("書名")
plt.ylabel("點擊數")
plt.show()

# Line chart: word count.
plt.plot(x, z, '-.', color='r', label="字數 單位/萬")
plt.xticks(rotation=90)
plt.legend(loc="best")
plt.xlabel("書名")
plt.ylabel("字數")
plt.title("飛盧小說字數趨勢圖")
plt.show()

# Bar chart: weekly hits.
plt.bar(x, y, alpha=0.2, width=0.4, color='w', edgecolor='red', lw=3)
plt.title("飛盧小說周點擊量柱狀圖")
plt.xticks(rotation=90)
plt.xlabel("小說名")
plt.ylabel("點擊量")
plt.show()

# Bar chart: word count.
plt.bar(x, z, alpha=0.2, width=0.4, color='g', edgecolor='red', lw=3)
plt.title("飛盧小說字數柱狀圖")
plt.xticks(rotation=90)
plt.xlabel("小說名")
plt.ylabel("字數")
plt.show()

# Horizontal bar chart: weekly hits. The data is in units of 萬, not 億.
plt.barh(x, y, alpha=0.2, height=0.4, color='b', edgecolor='gray', label="點擊量 單位/萬", lw=3)
plt.title("飛盧小說周點擊量水平圖")
plt.legend(loc="best")
plt.xlabel("點擊量")
plt.ylabel("小說名")
plt.show()

# Horizontal bar chart: word count. The data is in units of 萬, not 億.
plt.barh(x, z, alpha=0.2, height=0.4, color='g', edgecolor='gray', label="字數 單位/萬", lw=3)
plt.title("飛盧小說字數水平圖")
plt.legend(loc="best")
plt.xlabel("字數")
plt.ylabel("小說名")
plt.show()

# Scatter plot: weekly hits.
plt.scatter(x, y, color='w', marker='o', s=40, edgecolor='black', alpha=0.5)
plt.xticks(rotation=90)
plt.title("飛盧小說周點擊量散點圖")
plt.xlabel("小說名")
plt.ylabel("點擊量")
plt.show()

# Box plot: weekly hits (vertical, with the mean marked).
plt.boxplot(y, vert=True, showmeans=True)
plt.title("飛盧小說周點擊量盒圖")
plt.show()
import pandas as pd
import numpy as np
import wordcloud as wc
import random
import matplotlib.pyplot as plt
# Load the crawler output. Malformed rows are skipped; on_bad_lines replaces
# error_bad_lines, which was removed in pandas 2.0.
Fl = pd.read_csv(r'C:\Users\10950\Desktop\LP\Fl_pop.csv', on_bad_lines='skip')
word_cloud = wc.WordCloud(
    width=500,  # canvas width
    height=500,  # canvas height
    background_color='white',  # canvas background colour
    font_path='msyhbd.ttc',  # must be a font installed locally that covers Chinese
    max_font_size=400,  # upper bound on the font size of the largest word
    random_state=50,  # fixed seed so the generated image is reproducible
)
# Build one space-separated string of categories. Missing values are dropped
# first because str.join raises TypeError on NaN (a float).
text = " ".join(Fl['book_class'].dropna().astype(str))
word_cloud.generate(text)
plt.imshow(word_cloud)
plt.show()
五、總結
經過此次可視化分析,可以得出各小說周點擊數的情況,以及根據字數判斷小說已經更新到什麼程度,例如《武俠神奇皮膚系統》字數>500萬。達到預期目標!在此次設計過程中我學會了如何編寫爬蟲程序,雖然寫得很吃力,有些功能不懂如何實現,還需要去查找解決方案,但寫完項目的時候有滿滿的成就感。不足之處在於代碼經驗太少,暑假時要加強代碼練習。