I. Background of the Topic
Since ancient times, a love of reading has been a fine tradition of the Chinese nation. Since the founding of New China, with social stability, economic development, scientific progress, and the rising material and cultural living standards of the people, the nation's reading volume and reading efficiency have also increased to a certain extent.
Data source: Douban Books, https://book.douban.com/tag/?view=type&icn=index-sorttags-hot
II. Design of the Themed Web Crawler
1. Crawler Name
Douban book information crawling and visualization analysis
2. Content to Be Crawled and Data Characteristics
The crawled content includes: book title, basic information, rating-related data, star level, score, number of raters, and content description.
All of the data consist of text and numbers.
3. Overview of the Approach
Analyze the structure of the site's pages, pick out the CSS nodes on each page that hold the target data and extract them precisely, save the results as an xlsx file, and then perform data cleaning and visualization analysis on that file.
III. Structural Analysis of the Target Pages
Right-click the page and choose Inspect to locate the tags in the page layout that need to be crawled.
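As a quick way to confirm the layout before writing the crawler, the short sketch below (my own addition, not part of the original write-up) fetches one tag page and prints the nodes that the crawler later relies on, such as li.subject-item and div.pub. It assumes anonymous access works; if Douban asks for a login, the cookies used in Section IV are needed as well.

# Minimal structure-probing sketch (assumes the tag page is reachable without login)
import urllib.parse
import bs4
import requests

demo_url = 'https://book.douban.com/tag/' + urllib.parse.quote('編程')
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3298.4 Safari/537.36'}
res = requests.get(demo_url, headers=headers)
soup = bs4.BeautifulSoup(res.text, 'html.parser')

# Each book entry sits in <li class="subject-item">; inspect the first one
first_item = soup.find('li', attrs={'class': 'subject-item'})
if first_item is not None:
    print(first_item.find('a', attrs={'title': True}).attrs['title'])           # book title
    print(first_item.find('div', attrs={'class': 'pub'}).get_text(strip=True))  # publication info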
IV. Crawler Program Design
1. Data Crawling and Collection
Import the required libraries
# Import the modules needed for crawling and saving the results
import re
import time
import urllib.parse

import bs4
import requests
import pandas as pd
from openpyxl import Workbook
Since there are many book categories, first collect the category list.
# Category selection
def choice_category(cookies):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3298.4 Safari/537.36'
    }
    url = 'https://book.douban.com/tag/?view=type&icn=index-sorttags-hot'
    category_list = []
    res = requests.get(url, cookies=cookies, headers=headers)
    soup = bs4.BeautifulSoup(res.text, 'html.parser')
    # Find the block that holds all category lists
    soup_list = soup.find('div', attrs={'class': 'article'})
    # Major categories
    first_class = soup_list.findAll('a', attrs={'class': 'tag-title-wrapper'})
    # Sub-categories
    second_class = soup_list.findAll('table', attrs={'class': 'tagCol'})
    # Further extraction
    first_class_list = []
    for fc in first_class:
        first_class_list.append(fc.attrs['name'])
    num = 0
    for sc in second_class:
        second_class_list = []
        sc = sc.findAll('a')
        for sc_i in sc:
            second_class_list.append(sc_i.string.strip())
        category_list.append([first_class_list[num], second_class_list])
        num += 1
    return category_list
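A possible way to call this function is sketched below; the cookie value is a placeholder of my own, not a real session, and requests simply sends whatever dict it is given.

# Hypothetical usage sketch: cookies is a plain dict that requests attaches to every request
cookies = {'bid': 'xxxxxxxxxxx'}   # placeholder value, replace with a real Douban cookie
categories = choice_category(cookies)
for major, minors in categories:
    print(major, minors[:5])       # each major category and its first few sub-tags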
Start crawling the content we want.
# Book information crawler
def book_spider(book_tag, cookies):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3298.4 Safari/537.36'
    }
    books_list = []
    page_num = 0
    url = 'https://book.douban.com/tag/' + urllib.parse.quote(book_tag) + '?start=' + str(page_num*20) + '&type=T'
    res = requests.get(url, cookies=cookies, headers=headers)
    soup = bs4.BeautifulSoup(res.text, 'html.parser')
    # Find how many pages there are in total
    page_num_max = soup.find('div', attrs={'class': 'paginator'})
    page_num_max = page_num_max.findAll('a')
    page_num_max = page_num_max[-2].string.strip()
    page_num_max = int(page_num_max)
    while True:
        url = 'https://book.douban.com/tag/' + urllib.parse.quote(book_tag) + '?start=' + str(page_num*20) + '&type=T'
        res = requests.get(url, cookies=cookies, headers=headers)
        soup = bs4.BeautifulSoup(res.text, 'html.parser')
        # Find all books on this page
        soup_list = soup.findAll('li', attrs={'class': 'subject-item'})
        for book_info in soup_list:
            # Book title
            title = book_info.find('a', attrs={'title': True})
            book_url = title.attrs['href']
            title = title.attrs['title']
            # Basic information
            basic_info = book_info.find('div', attrs={'class': 'pub'}).string.strip()
            basic_info_list = basic_info.split('/')
            try:
                author_info = '/'.join(basic_info_list[0: -3])
            except:
                author_info = '暫無'
            try:
                pub_info = '/'.join(basic_info_list[-3:])
            except:
                pub_info = '暫無'
            # Rating-related data
            evaluate_info = book_info.find('div', attrs={'class': 'star clearfix'})
            # Star level
            try:
                allstar = evaluate_info.find('span', attrs={'class': True})
                if (allstar.attrs['class'])[0][-1] == '1':
                    allstar = (allstar.attrs['class'])[0][-1]
                else:
                    allstar = (allstar.attrs['class'])[0][-2] + '.' + (allstar.attrs['class'])[0][-1]
            except:
                allstar = '0.0'
            # Score
            try:
                rating_nums = evaluate_info.find('span', attrs={'class': 'rating_nums'}).string.strip()
            except:
                rating_nums = '0.0'
            # Number of raters
            try:
                people_num = evaluate_info.find('span', attrs={'class': 'pl'}).string.strip()
                people_num = people_num[1: -4]
            except:
                people_num = '0'
            # Content description
            try:
                description = book_info.find('p').string.strip()
            except:
                description = '暫無'
            # Collect the fields for this book
            books_list.append([title, author_info, pub_info, allstar, rating_nums, people_num, description, book_url])
        print('Page %d of %d collected' % (page_num+1, page_num_max))
        time.sleep(0.5)
        page_num += 1
        if page_num == page_num_max:
            break
    return books_list
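The star level is recovered from the CSS class name of the rating span; judging from the parsing logic above, a class such as allstar45 is read as 4.5 (this interpretation of the class naming is inferred from the code, not documented by Douban). A tiny standalone check of that decoding:

# Sketch: decoding a star-level class the same way book_spider does
cls = ['allstar45']                        # example class list as BeautifulSoup would return it
if cls[0][-1] == '1':
    star = cls[0][-1]                      # single-digit case is kept as-is
else:
    star = cls[0][-2] + '.' + cls[0][-1]   # 'allstar45' -> '4.5'
print(star)                                # prints 4.5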
Save the data to an Excel file.
# Save the results to an Excel file
def save_to_excel(books_list, excel_name):
    wb = Workbook()
    ws = wb.active
    ws.append(['序號', '書名', '作者/譯者', '出版信息', '星級', '評分', '評價人數', '簡介', '豆瓣鏈接'])
    count = 1
    for bl in books_list:
        ws.append([count, bl[0], bl[1], bl[2], bl[3], bl[4], bl[5], bl[6], bl[7]])
        count += 1
    wb.save('./results/' + excel_name + '.xlsx')
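Putting the three functions together, a minimal driver might look like the sketch below. The cookie value and the chosen tag are placeholders of my own, and the ./results/ directory must already exist because save_to_excel writes into it.

# Hypothetical driver tying the three functions together
if __name__ == '__main__':
    cookies = {'bid': 'xxxxxxxxxxx'}   # placeholder cookie
    book_tag = '編程'                   # example tag; any tag returned by choice_category works
    books = book_spider(book_tag, cookies)
    save_to_excel(books, book_tag)     # writes ./results/編程.xlsx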
2. Data Cleaning and Processing
Import the required libraries
# Data processing
import pandas as pd
Read the book data for one category (the programming category is used as the example here).
# Read the data
db_read = pd.read_excel("編程.xlsx", index_col=0)
# Data cleaning: drop duplicate rows
db_read = db_read.drop_duplicates()
db_read.head()
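Because the crawler stores every field as text, the 評分 and 評價人數 columns may come back as strings. A small optional cleaning step (my own addition, not in the original notebook) coerces them to numbers so that the sorting and regression below behave as expected:

# Optional: make sure the score and rater-count columns are numeric
db_read["評分"] = pd.to_numeric(db_read["評分"], errors="coerce")
db_read["評價人數"] = pd.to_numeric(db_read["評價人數"], errors="coerce")
db_read = db_read.dropna(subset=["評分", "評價人數"])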
3. Data Analysis and Visualization
Read the data and view the first five rows.
# Visualization libraries
from matplotlib import pyplot as plt
import seaborn as sns
# Read the data
db_read = pd.read_excel("編程.xlsx", index_col=0)
# Check the data information
db_read.info()
# View the first five rows
db_read.head()
Sort in descending order by score.
# Sort the whole dataset by score
desc_data = db_read.sort_values(by="評分", ascending=False)
desc_data
Looking at the score distribution of programming books, the scores mainly fall between 6 and 9.5.
# Visualization: where the scores are mainly distributed
sns.distplot(db_read["評分"])
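distplot works on the seaborn versions current at the time, but it has since been deprecated; on a newer seaborn the equivalent call would be:

# Equivalent plot on seaborn >= 0.11, where distplot is deprecated
sns.histplot(db_read["評分"], kde=True)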
Take the top 100 entries and observe the star-level distribution.
# Take the top 100 entries to show the star-level distribution
top_100 = desc_data.iloc[:100, :]
sns.catplot(x="星級", data=top_100, kind="count", height=10)
plt.xticks(rotation=90)
plt.show()
Generate a word cloud from a book's popular short comments.
import jieba
from wordcloud import WordCloud

# Build a word cloud from the book's popular short comments
def Book_Blurb_wordcloud(url, cookies):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3298.4 Safari/537.36'
    }
    page_num = 0
    hot_comment_list = []
    while True:
        page_num += 1
        # Build the URL of this page of hot comments without overwriting the base url
        page_url = url + 'comments/hot?p={}'.format(page_num)
        res = requests.get(page_url, cookies=cookies, headers=headers)
        soup = bs4.BeautifulSoup(res.text, 'html.parser')
        # Find all short comments on this page
        soup_list = soup.findAll('p', attrs={'class': 'comment-content'})
        for com in soup_list:
            comment = com.string.strip()
            hot_comment_list.append(comment)
        print('Page %d of short comments collected' % page_num)
        if page_num > 19:
            print('First 20 pages of short comments collected, start building the word cloud')
            break
    all_comments = ''
    for hc in hot_comment_list:
        all_comments += hc
    # filterword is a user-defined text-filtering helper (see the sketch below)
    all_comments = filterword(all_comments)
    words = ' '.join(jieba.cut(all_comments))
    # Set the font path here (a Chinese font is needed for Chinese text)
    Words_Cloud = WordCloud(font_path="simkai.ttf").generate(words)
    Words_Cloud.to_file('Book_Blurb.jpg')
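The filterword helper called above is not shown in the original code. One plausible implementation (purely an assumption about what it does) keeps only Chinese characters, letters, and digits before segmentation:

import re

# Hypothetical implementation of the filterword helper used above:
# keep Chinese characters, letters and digits, drop punctuation and symbols
def filterword(text):
    return ''.join(re.findall(r'[\u4e00-\u9fa5A-Za-z0-9]+', text))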
Perform sentiment analysis on the short comments.
import numpy as np
from snownlp import SnowNLP

# Sentiment analysis of short comments
def emotion_analysis(url, cookies):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3298.4 Safari/537.36'
    }
    page_num = 0
    hot_comment_list = []
    while True:
        page_num += 1
        # Build the URL of this page of hot comments without overwriting the base url
        page_url = url + 'comments/hot?p={}'.format(page_num)
        res = requests.get(page_url, cookies=cookies, headers=headers)
        soup = bs4.BeautifulSoup(res.text, 'html.parser')
        # Find all short comments on this page
        soup_list = soup.findAll('p', attrs={'class': 'comment-content'})
        for com in soup_list:
            comment = com.string.strip()
            hot_comment_list.append(comment)
        print('Page %d of short comments collected' % page_num)
        if page_num > 19:
            print('First 20 pages of short comments collected, start sentiment analysis')
            break
    marks_list = []
    for com in hot_comment_list:
        mark = SnowNLP(com)
        marks_list.append(mark.sentiments)
    plt.hist(marks_list, bins=np.arange(0, 1, 0.02))
    plt.show()
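SnowNLP's sentiments score is a value between 0 and 1, where values near 1 suggest a positive comment and values near 0 a negative one. A quick check on two made-up comments of my own:

from snownlp import SnowNLP

# Sentiment scores for single comments (0 ~ 1, higher means more positive)
print(SnowNLP('這本書寫得很好,講解清晰').sentiments)   # expected to be closer to 1
print(SnowNLP('內容太差,浪費時間').sentiments)           # expected to be closer to 0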
4. Based on the relationships in the data, analyze the correlation coefficient between two variables, draw a scatter plot, and build a regression equation between them.
Draw a regression plot to analyze the relationship between the score and the number of raters.
# Import the required libraries
import pandas as pd
import numpy as np
import sklearn
# Load the local 編程.xlsx data into a DataFrame
boston_df = pd.read_excel('編程.xlsx', index_col=0)
boston_df.head()
# Draw the regression plot
import seaborn as sns
sns.regplot(x=boston_df.評價人數, y=boston_df.評分)
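To actually report the correlation coefficient and the regression equation mentioned above, a short follow-up sketch (my addition; it assumes the two columns have been coerced to numeric values as shown in section 2) could be:

from sklearn.linear_model import LinearRegression

# Pearson correlation coefficient between number of raters and score
r = boston_df["評價人數"].corr(boston_df["評分"])
print('correlation coefficient r = %.3f' % r)

# Simple linear regression: 評分 = a * 評價人數 + b
X = boston_df[["評價人數"]].values
y = boston_df["評分"].values
model = LinearRegression().fit(X, y)
print('regression equation: y = %.6f x + %.3f' % (model.coef_[0], model.intercept_))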
5. Complete Program Code
# Imports for the whole program
import re
import time
import urllib.parse

import bs4
import jieba
import numpy as np
import pandas as pd
import requests
import seaborn as sns
from matplotlib import pyplot as plt
from openpyxl import Workbook
from sklearn.linear_model import LinearRegression
from snownlp import SnowNLP
from wordcloud import WordCloud


# Category selection
def choice_category(cookies):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3298.4 Safari/537.36'
    }
    url = 'https://book.douban.com/tag/?view=type&icn=index-sorttags-hot'
    category_list = []
    res = requests.get(url, cookies=cookies, headers=headers)
    soup = bs4.BeautifulSoup(res.text, 'html.parser')
    # Find the block that holds all category lists
    soup_list = soup.find('div', attrs={'class': 'article'})
    # Major categories
    first_class = soup_list.findAll('a', attrs={'class': 'tag-title-wrapper'})
    # Sub-categories
    second_class = soup_list.findAll('table', attrs={'class': 'tagCol'})
    # Further extraction
    first_class_list = []
    for fc in first_class:
        first_class_list.append(fc.attrs['name'])
    num = 0
    for sc in second_class:
        second_class_list = []
        sc = sc.findAll('a')
        for sc_i in sc:
            second_class_list.append(sc_i.string.strip())
        category_list.append([first_class_list[num], second_class_list])
        num += 1
    return category_list


# Book information crawler
def book_spider(book_tag, cookies):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3298.4 Safari/537.36'
    }
    books_list = []
    page_num = 0
    url = 'https://book.douban.com/tag/' + urllib.parse.quote(book_tag) + '?start=' + str(page_num*20) + '&type=T'
    res = requests.get(url, cookies=cookies, headers=headers)
    soup = bs4.BeautifulSoup(res.text, 'html.parser')
    # Find how many pages there are in total
    page_num_max = soup.find('div', attrs={'class': 'paginator'})
    page_num_max = page_num_max.findAll('a')
    page_num_max = page_num_max[-2].string.strip()
    page_num_max = int(page_num_max)
    while True:
        url = 'https://book.douban.com/tag/' + urllib.parse.quote(book_tag) + '?start=' + str(page_num*20) + '&type=T'
        res = requests.get(url, cookies=cookies, headers=headers)
        soup = bs4.BeautifulSoup(res.text, 'html.parser')
        # Find all books on this page
        soup_list = soup.findAll('li', attrs={'class': 'subject-item'})
        for book_info in soup_list:
            # Book title
            title = book_info.find('a', attrs={'title': True})
            book_url = title.attrs['href']
            title = title.attrs['title']
            # Basic information
            basic_info = book_info.find('div', attrs={'class': 'pub'}).string.strip()
            basic_info_list = basic_info.split('/')
            try:
                author_info = '/'.join(basic_info_list[0: -3])
            except:
                author_info = '暫無'
            try:
                pub_info = '/'.join(basic_info_list[-3:])
            except:
                pub_info = '暫無'
            # Rating-related data
            evaluate_info = book_info.find('div', attrs={'class': 'star clearfix'})
            # Star level
            try:
                allstar = evaluate_info.find('span', attrs={'class': True})
                if (allstar.attrs['class'])[0][-1] == '1':
                    allstar = (allstar.attrs['class'])[0][-1]
                else:
                    allstar = (allstar.attrs['class'])[0][-2] + '.' + (allstar.attrs['class'])[0][-1]
            except:
                allstar = '0.0'
            # Score
            try:
                rating_nums = evaluate_info.find('span', attrs={'class': 'rating_nums'}).string.strip()
            except:
                rating_nums = '0.0'
            # Number of raters
            try:
                people_num = evaluate_info.find('span', attrs={'class': 'pl'}).string.strip()
                people_num = people_num[1: -4]
            except:
                people_num = '0'
            # Content description
            try:
                description = book_info.find('p').string.strip()
            except:
                description = '暫無'
            # Collect the fields for this book
            books_list.append([title, author_info, pub_info, allstar, rating_nums, people_num, description, book_url])
        print('Page %d of %d collected' % (page_num+1, page_num_max))
        time.sleep(0.5)
        page_num += 1
        if page_num == page_num_max:
            break
    return books_list


# Save the results to an Excel file
def save_to_excel(books_list, excel_name):
    wb = Workbook()
    ws = wb.active
    ws.append(['序號', '書名', '作者/譯者', '出版信息', '星級', '評分', '評價人數', '簡介', '豆瓣鏈接'])
    count = 1
    for bl in books_list:
        ws.append([count, bl[0], bl[1], bl[2], bl[3], bl[4], bl[5], bl[6], bl[7]])
        count += 1
    wb.save('./results/' + excel_name + '.xlsx')


# Data processing: read the data
db_read = pd.read_excel("編程.xlsx", index_col=0)
# Data cleaning: drop duplicate rows
db_read = db_read.drop_duplicates()
db_read.head()

# Check the data information and the first five rows
db_read.info()
db_read.head()

# Sort the whole dataset by score
desc_data = db_read.sort_values(by="評分", ascending=False)
desc_data

# Where the scores are mainly distributed
sns.distplot(db_read["評分"])

# Take the top 100 entries to show the star-level distribution
top_100 = desc_data.iloc[:100, :]
sns.catplot(x="星級", data=top_100, kind="count", height=10)
plt.xticks(rotation=90)
plt.show()


# Build a word cloud from the book's popular short comments
def Book_Blurb_wordcloud(url, cookies):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3298.4 Safari/537.36'
    }
    page_num = 0
    hot_comment_list = []
    while True:
        page_num += 1
        # Build the URL of this page of hot comments without overwriting the base url
        page_url = url + 'comments/hot?p={}'.format(page_num)
        res = requests.get(page_url, cookies=cookies, headers=headers)
        soup = bs4.BeautifulSoup(res.text, 'html.parser')
        # Find all short comments on this page
        soup_list = soup.findAll('p', attrs={'class': 'comment-content'})
        for com in soup_list:
            comment = com.string.strip()
            hot_comment_list.append(comment)
        print('Page %d of short comments collected' % page_num)
        if page_num > 19:
            print('First 20 pages of short comments collected, start building the word cloud')
            break
    all_comments = ''
    for hc in hot_comment_list:
        all_comments += hc
    # filterword is a user-defined text-filtering helper (see the sketch in section 3)
    all_comments = filterword(all_comments)
    words = ' '.join(jieba.cut(all_comments))
    # Set the font path here (a Chinese font is needed for Chinese text)
    Words_Cloud = WordCloud(font_path="simkai.ttf").generate(words)
    Words_Cloud.to_file('Book_Blurb.jpg')


# Sentiment analysis of short comments
def emotion_analysis(url, cookies):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3298.4 Safari/537.36'
    }
    page_num = 0
    hot_comment_list = []
    while True:
        page_num += 1
        page_url = url + 'comments/hot?p={}'.format(page_num)
        res = requests.get(page_url, cookies=cookies, headers=headers)
        soup = bs4.BeautifulSoup(res.text, 'html.parser')
        # Find all short comments on this page
        soup_list = soup.findAll('p', attrs={'class': 'comment-content'})
        for com in soup_list:
            comment = com.string.strip()
            hot_comment_list.append(comment)
        print('Page %d of short comments collected' % page_num)
        if page_num > 19:
            print('First 20 pages of short comments collected, start sentiment analysis')
            break
    marks_list = []
    for com in hot_comment_list:
        mark = SnowNLP(com)
        marks_list.append(mark.sentiments)
    plt.hist(marks_list, bins=np.arange(0, 1, 0.02))
    plt.show()


# Regression plot of number of raters vs. score
boston_df = pd.read_excel('編程.xlsx', index_col=0)
boston_df.head()
sns.regplot(x=boston_df.評價人數, y=boston_df.評分)
V. Summary
Through this crawling and analysis of Douban books, I learned about people's current demand for different categories of books, and also got a sense of the convenience brought by social development.
Whether on paper or as e-books, read more!