Scraping and Analyzing Douban Book Data with Python


I. Background

  From ancient times to the present, a love of reading has been a fine tradition of the Chinese nation. Since the founding of the People's Republic, with social stability, economic development, scientific progress, and the rise of both material and cultural living standards, the nation's reading volume and reading efficiency have also steadily increased.

  Data source: Douban Books, https://book.douban.com/tag/?view=type&icn=index-sorttags-hot

II. Design of the Topic-Based Web Crawler

1. Crawler Name

  Douban book information scraping and visual analysis

2. Content to Scrape and Data Characteristics

  The scraped content includes: book title, basic information, evaluation data, star level, rating, number of raters, and a content description.

  All of the data consist of text and numbers.

 

3. Plan Overview

  Analyze the structure of the site's pages, target the CSS nodes that hold the desired data for precise extraction, save the results to an .xlsx file, and then clean and visually analyze that file. A small driver ties the steps together, as sketched below.
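  A minimal driver sketch, assuming a logged-in Douban cookie exported from the browser (the cookie value here is a placeholder) and the three functions developed in Section IV:

# Hypothetical driver chaining the Section IV functions together
cookies = {'bid': 'your-cookie-value'}   # placeholder; export your own cookie

def main():
    categories = choice_category(cookies)   # [[group, [tags...]], ...]
    tag = categories[0][1][0]               # e.g. the first sub-tag
    books = book_spider(tag, cookies)       # scrape every listing page
    save_to_excel(books, tag)               # writes ./results/<tag>.xlsx

if __name__ == '__main__':
    main()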

 

III. Structural Analysis of the Target Pages

  Right-click the page and choose Inspect to locate the layout tags that contain the data to be scraped.
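  A quick probe with requests and BeautifulSoup can confirm the structure before writing the full crawler (a sketch; the selectors mirror Douban's layout at the time of writing and may change):

import requests, bs4, urllib.parse

tag_url = 'https://book.douban.com/tag/' + urllib.parse.quote('編程')
res = requests.get(tag_url, headers={'User-Agent': 'Mozilla/5.0'})
soup = bs4.BeautifulSoup(res.text, 'html.parser')
# Each book on a tag page sits inside <li class="subject-item">; within it,
# div.pub carries the author/publisher text and div.star.clearfix the rating spans.
print(len(soup.findAll('li', attrs={'class': 'subject-item'})))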

 

 

IV. Crawler Program Design

1. Data Scraping and Collection

  Import the required libraries

# Import the modules the crawler relies on
import requests
import bs4              # the code below calls bs4.BeautifulSoup directly
import urllib.parse     # URL-encodes the Chinese tag names
import time             # polite pause between page requests
import os               # to create the output directory
import pandas as pd
from openpyxl import Workbook   # writes the .xlsx result file

  Since books span many categories, first collect the category list

# Category selection
def choice_category(cookies):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3298.4 Safari/537.36'
        }
    url = 'https://book.douban.com/tag/?view=type&icn=index-sorttags-hot'
    category_list = []
    res = requests.get(url, cookies=cookies, headers=headers)
    soup = bs4.BeautifulSoup(res.text, 'html.parser')
    # Locate the block that holds all the category lists
    soup_list = soup.find('div', attrs={'class': 'article'})
    # Top-level categories
    first_class = soup_list.findAll('a', attrs={'class': 'tag-title-wrapper'})
    # Sub-categories
    second_class = soup_list.findAll('table', attrs={'class': 'tagCol'})
    # Extract the names
    first_class_list = []
    for fc in first_class:
        first_class_list.append(fc.attrs['name'])
    num = 0
    for sc in second_class:
        second_class_list = []
        sc = sc.findAll('a')
        for sc_i in sc:
            second_class_list.append(sc_i.string.strip())
        category_list.append([first_class_list[num], second_class_list])
        num += 1
    return category_list
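  A hypothetical call (the cookie value is a placeholder):

cookies = {'bid': 'your-cookie-value'}       # placeholder
for group, tags in choice_category(cookies):
    print(group, tags[:5])                   # group name and its first few tags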

  With the categories in hand, scrape the book listings themselves

# Book information crawler
def book_spider(book_tag, cookies):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3298.4 Safari/537.36'
        }
    books_list = []
    page_num = 0
    url = 'https://book.douban.com/tag/' + urllib.parse.quote(book_tag) + '?start=' + str(page_num*20) + '&type=T'
    res = requests.get(url, cookies=cookies, headers=headers)
    soup = bs4.BeautifulSoup(res.text, 'html.parser')
    # Find the total number of pages
    page_num_max = soup.find('div', attrs={'class': 'paginator'})
    page_num_max = page_num_max.findAll('a')
    page_num_max = page_num_max[-2].string.strip()
    page_num_max = int(page_num_max)
    while True:
        url = 'https://book.douban.com/tag/' + urllib.parse.quote(book_tag) + '?start=' + str(page_num*20) + '&type=T'
        res = requests.get(url, cookies=cookies, headers=headers)
        soup = bs4.BeautifulSoup(res.text, 'html.parser')
        # All books on this page
        soup_list = soup.findAll('li', attrs={'class': 'subject-item'})
        for book_info in soup_list:
            # Title
            title = book_info.find('a', attrs={'title': True})
            book_url = title.attrs['href']
            title = title.attrs['title']
            # Basic information
            basic_info = book_info.find('div', attrs={'class': 'pub'}).string.strip()
            basic_info_list = basic_info.split('/')
            try:
                author_info = '/'.join(basic_info_list[0: -3])
            except:
                author_info = '暫無'
            try:
                pub_info = '/'.join(basic_info_list[-3: ])
            except:
                pub_info = '暫無'
            # Evaluation data
            evaluate_info = book_info.find('div', attrs={'class': 'star clearfix'})
            # Star level (derived from a class name such as allstar45 -> 4.5)
            try:
                allstar = evaluate_info.find('span', attrs={'class': True})
                if (allstar.attrs['class'])[0][-1] == '1':
                    allstar = (allstar.attrs['class'])[0][-1]
                else:
                    allstar = (allstar.attrs['class'])[0][-2] + '.' + (allstar.attrs['class'])[0][-1]
            except:
                allstar = '0.0'
            # Rating
            try:
                rating_nums = evaluate_info.find('span', attrs={'class': 'rating_nums'}).string.strip()
            except:
                rating_nums = '0.0'
            # Number of raters
            try:
                people_num = evaluate_info.find('span', attrs={'class': 'pl'}).string.strip()
                people_num = people_num[1: -4]
            except:
                people_num = '0'
            # Description
            try:
                description = book_info.find('p').string.strip()
            except:
                description = '暫無'
            # Collect the record
            books_list.append([title, author_info, pub_info, allstar, rating_nums, people_num, description, book_url])
        print('Page %d of %d collected' % (page_num+1, page_num_max))
        time.sleep(0.5)
        page_num += 1
        if page_num == page_num_max:
            break
    return books_list

  Save the results to an Excel file

# Save the results to an Excel workbook
def save_to_excel(books_list, excel_name):
    wb = Workbook()
    ws = wb.active
    # The column headers stay in Chinese because the analysis code below
    # refers to them by name (評分, 星級, 評價人數)
    ws.append(['序號', '書名', '作者/譯者', '出版信息', '星級', '評分', '評價人數', '簡介', '豆瓣鏈接'])
    count = 1
    for bl in books_list:
        ws.append([count, bl[0], bl[1], bl[2], bl[3], bl[4], bl[5], bl[6], bl[7]])
        count += 1
    os.makedirs('./results', exist_ok=True)   # make sure the output folder exists
    wb.save('./results/' + excel_name + '.xlsx')

 

2. Data Cleaning and Processing

  Import the required libraries

# Data processing
import pandas as pd

  Load one category's book data (here the 編程 (programming) category serves as the example)

# Load the data
db_read = pd.read_excel("編程.xlsx", index_col=0)
# Data cleaning: drop duplicate rows
db_read = db_read.drop_duplicates()
db_read.head()
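  The 評分 (rating) and 評價人數 (number of raters) columns are scraped as strings; depending on how Excel parsed them they may or may not already be numeric. A hedged coercion step before the numeric analysis below is harmless either way:

# Coerce the rating and rater-count columns to numbers; rows that fail
# to parse become NaN and are dropped
db_read["評分"] = pd.to_numeric(db_read["評分"], errors="coerce")
db_read["評價人數"] = pd.to_numeric(db_read["評價人數"], errors="coerce")
db_read = db_read.dropna(subset=["評分", "評價人數"])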

3. Data Analysis and Visualization

  Load the data and inspect the first five rows

# Visualization
from matplotlib import pyplot as plt
import seaborn as sns
# Load the data
db_read = pd.read_excel("編程.xlsx", index_col=0)
# Inspect the column types and memory usage
db_read.info()
# Preview the first five rows
db_read.head()

 

  Sort by rating in descending order

# Sort the whole table by rating
desc_data = db_read.sort_values(by="評分", ascending=False)
desc_data

  The rating distribution for programming books shows that most scores fall between 6 and 9.5.

# Where the ratings concentrate (distplot is deprecated in newer seaborn;
# sns.histplot(db_read["評分"], kde=True) is the modern equivalent)
sns.distplot(db_read["評分"])

  Take the top 100 books and look at the star-level distribution

# Take the top 100 rows to show the star-level distribution
top_100 = desc_data.iloc[:100, :]
sns.catplot(x="星級", data=top_100, kind="count", height=10)
plt.xticks(rotation=90)
plt.show()

  Generate a word cloud from a book's popular short reviews

# Build a word cloud from a book's popular short reviews
# (requires the jieba and wordcloud packages)
import jieba
from wordcloud import WordCloud

def Book_Blurb_wordcloud(url, cookies):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3298.4 Safari/537.36'
        }
    page_num = 0
    hot_comment_list = []
    while True:
        page_num += 1
        # Build the page URL fresh each iteration from the base url
        page_url = url + 'comments/hot?p={}'.format(page_num)
        res = requests.get(page_url, cookies=cookies, headers=headers)
        soup = bs4.BeautifulSoup(res.text, 'html.parser')
        # All short reviews on this page
        soup_list = soup.findAll('p', attrs={'class': 'comment-content'})
        for com in soup_list:
            comment = com.string.strip()
            hot_comment_list.append(comment)
        print('Page %d of reviews collected' % page_num)
        if page_num > 19:
            print('First 20 pages collected; building the word cloud')
            break
    all_comments = ''
    for hc in hot_comment_list:
        all_comments += hc
    all_comments = filterword(all_comments)   # user-supplied stop-word filter
    words = ' '.join(jieba.cut(all_comments))
    # Set the font path here (a CJK-capable font is required)
    Words_Cloud = WordCloud(font_path="simkai.ttf").generate(words)
    Words_Cloud.to_file('Book_Blurb.jpg')
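  Note that filterword is never defined in the article. A minimal stand-in, assuming it simply strips punctuation and a few common stop words, might look like:

import re

STOP_WORDS = {'的', '了', '是', '我', '也', '都', '很'}  # illustrative subset only

def filterword(text):
    # Keep only Chinese characters, letters and digits, then drop stop words
    text = ''.join(re.findall(r'[\u4e00-\u9fa5A-Za-z0-9]+', text))
    return ''.join(ch for ch in text if ch not in STOP_WORDS)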

  Run sentiment analysis on the short reviews

# Sentiment analysis of the short reviews
# (requires the snownlp package)
import numpy as np
from snownlp import SnowNLP

def emotion_analysis(url, cookies):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3298.4 Safari/537.36'
        }
    page_num = 0
    hot_comment_list = []
    while True:
        page_num += 1
        # Build the page URL fresh each iteration from the base url
        page_url = url + 'comments/hot?p={}'.format(page_num)
        res = requests.get(page_url, cookies=cookies, headers=headers)
        soup = bs4.BeautifulSoup(res.text, 'html.parser')
        # All short reviews on this page
        soup_list = soup.findAll('p', attrs={'class': 'comment-content'})
        for com in soup_list:
            comment = com.string.strip()
            hot_comment_list.append(comment)
        print('Page %d of reviews collected' % page_num)
        if page_num > 19:
            print('First 20 pages collected; starting sentiment analysis')
            break
    marks_list = []
    for com in hot_comment_list:
        # SnowNLP scores each review between 0 (negative) and 1 (positive)
        mark = SnowNLP(com)
        marks_list.append(mark.sentiments)
    plt.hist(marks_list, bins=np.arange(0, 1, 0.02))
    plt.show()
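  A hypothetical invocation (the subject URL is a placeholder for a book page returned by book_spider; both helpers expect it to end with a slash so that 'comments/hot?p=N' appends cleanly):

book_url = 'https://book.douban.com/subject/1234567/'   # placeholder book page
cookies = {'bid': 'your-cookie-value'}                   # placeholder cookie
Book_Blurb_wordcloud(book_url, cookies)
emotion_analysis(book_url, cookies)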

 

4. Analyzing the correlation between two variables, drawing a scatter plot, and fitting a regression equation

  Plot a regression between the rating and the number of raters

# Import the required libraries
import pandas as pd
import numpy as np
import seaborn as sns
# Load the local 編程.xlsx data into a DataFrame
boston_df = pd.read_excel('編程.xlsx', index_col=0)
boston_df.head()
# Draw the regression plot (newer seaborn versions require keyword arguments)
sns.regplot(x=boston_df["評價人數"], y=boston_df["評分"])
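  The heading above also calls for the correlation coefficient and an explicit regression equation; a sketch using pandas and scikit-learn, assuming the two columns have been coerced to numeric as in the cleaning step:

from sklearn.linear_model import LinearRegression

# Pearson correlation between number of raters and rating
corr = boston_df["評價人數"].corr(boston_df["評分"])
print('correlation coefficient: %.3f' % corr)

# Fit 評分 = a * 評價人數 + b and report the equation
X = boston_df[["評價人數"]].values
y = boston_df["評分"].values
model = LinearRegression().fit(X, y)
print('regression equation: y = %.6f * x + %.3f' % (model.coef_[0], model.intercept_))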

 

 

5. Complete Program Code

 

# Imports for the whole program
import requests
import bs4
import urllib.parse
import time
import os
import pandas as pd
from openpyxl import Workbook

#  Category selection
def choice_category(cookies):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3298.4 Safari/537.36'
        }
    url = 'https://book.douban.com/tag/?view=type&icn=index-sorttags-hot'
    category_list = []
    res = requests.get(url, cookies=cookies, headers=headers)
    soup = bs4.BeautifulSoup(res.text, 'html.parser')
    # Locate the block that holds all the category lists
    soup_list = soup.find('div', attrs={'class': 'article'})
    # Top-level categories
    first_class = soup_list.findAll('a', attrs={'class': 'tag-title-wrapper'})
    # Sub-categories
    second_class = soup_list.findAll('table', attrs={'class': 'tagCol'})
    # Extract the names
    first_class_list = []
    for fc in first_class:
        first_class_list.append(fc.attrs['name'])
    num = 0
    for sc in second_class:
        second_class_list = []
        sc = sc.findAll('a')
        for sc_i in sc:
            second_class_list.append(sc_i.string.strip())
        category_list.append([first_class_list[num], second_class_list])
        num += 1
    return category_list


# Book information crawler
def book_spider(book_tag, cookies):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3298.4 Safari/537.36'
        }
    books_list = []
    page_num = 0
    url = 'https://book.douban.com/tag/' + urllib.parse.quote(book_tag) + '?start=' + str(page_num*20) + '&type=T'
    res = requests.get(url, cookies=cookies, headers=headers)
    soup = bs4.BeautifulSoup(res.text, 'html.parser')
    # Find the total number of pages
    page_num_max = soup.find('div', attrs={'class': 'paginator'})
    page_num_max = page_num_max.findAll('a')
    page_num_max = page_num_max[-2].string.strip()
    page_num_max = int(page_num_max)
    while True:
        url = 'https://book.douban.com/tag/' + urllib.parse.quote(book_tag) + '?start=' + str(page_num*20) + '&type=T'
        res = requests.get(url, cookies=cookies, headers=headers)
        soup = bs4.BeautifulSoup(res.text, 'html.parser')
        # All books on this page
        soup_list = soup.findAll('li', attrs={'class': 'subject-item'})
        for book_info in soup_list:
            # Title
            title = book_info.find('a', attrs={'title': True})
            book_url = title.attrs['href']
            title = title.attrs['title']
            # Basic information
            basic_info = book_info.find('div', attrs={'class': 'pub'}).string.strip()
            basic_info_list = basic_info.split('/')
            try:
                author_info = '/'.join(basic_info_list[0: -3])
            except:
                author_info = '暫無'
            try:
                pub_info = '/'.join(basic_info_list[-3: ])
            except:
                pub_info = '暫無'
            # Evaluation data
            evaluate_info = book_info.find('div', attrs={'class': 'star clearfix'})
            # Star level (derived from a class name such as allstar45 -> 4.5)
            try:
                allstar = evaluate_info.find('span', attrs={'class': True})
                if (allstar.attrs['class'])[0][-1] == '1':
                    allstar = (allstar.attrs['class'])[0][-1]
                else:
                    allstar = (allstar.attrs['class'])[0][-2] + '.' + (allstar.attrs['class'])[0][-1]
            except:
                allstar = '0.0'
            # Rating
            try:
                rating_nums = evaluate_info.find('span', attrs={'class': 'rating_nums'}).string.strip()
            except:
                rating_nums = '0.0'
            # Number of raters
            try:
                people_num = evaluate_info.find('span', attrs={'class': 'pl'}).string.strip()
                people_num = people_num[1: -4]
            except:
                people_num = '0'
            # Description
            try:
                description = book_info.find('p').string.strip()
            except:
                description = '暫無'
            # Collect the record
            books_list.append([title, author_info, pub_info, allstar, rating_nums, people_num, description, book_url])
        print('Page %d of %d collected' % (page_num+1, page_num_max))
        time.sleep(0.5)
        page_num += 1
        if page_num == page_num_max:
            break
    return books_list


# Save the results to an Excel workbook
def save_to_excel(books_list, excel_name):
    wb = Workbook()
    ws = wb.active
    ws.append(['序號', '書名', '作者/譯者', '出版信息', '星級', '評分', '評價人數', '簡介', '豆瓣鏈接'])
    count = 1
    for bl in books_list:
        ws.append([count, bl[0], bl[1], bl[2], bl[3], bl[4], bl[5], bl[6], bl[7]])
        count += 1
    os.makedirs('./results', exist_ok=True)   # make sure the output folder exists
    wb.save('./results/' + excel_name + '.xlsx')


# Data processing
# Load the data
db_read = pd.read_excel("編程.xlsx", index_col=0)
# Data cleaning: drop duplicate rows
db_read = db_read.drop_duplicates()
db_read.head()

# Visualization
from matplotlib import pyplot as plt
import seaborn as sns
# Inspect the column types and memory usage
db_read.info()
# Preview the first five rows
db_read.head()

# Sort the whole table by rating
desc_data = db_read.sort_values(by="評分", ascending=False)
desc_data

# Where the ratings concentrate
sns.distplot(db_read["評分"])

# Take the top 100 rows to show the star-level distribution
top_100 = desc_data.iloc[:100, :]
sns.catplot(x="星級", data=top_100, kind="count", height=10)
plt.xticks(rotation=90)
plt.show()

# Build a word cloud from a book's popular short reviews
import jieba
from wordcloud import WordCloud

def Book_Blurb_wordcloud(url, cookies):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3298.4 Safari/537.36'
        }
    page_num = 0
    hot_comment_list = []
    while True:
        page_num += 1
        # Build the page URL fresh each iteration from the base url
        page_url = url + 'comments/hot?p={}'.format(page_num)
        res = requests.get(page_url, cookies=cookies, headers=headers)
        soup = bs4.BeautifulSoup(res.text, 'html.parser')
        # All short reviews on this page
        soup_list = soup.findAll('p', attrs={'class': 'comment-content'})
        for com in soup_list:
            comment = com.string.strip()
            hot_comment_list.append(comment)
        print('Page %d of reviews collected' % page_num)
        if page_num > 19:
            print('First 20 pages collected; building the word cloud')
            break
    all_comments = ''
    for hc in hot_comment_list:
        all_comments += hc
    all_comments = filterword(all_comments)   # user-supplied stop-word filter
    words = ' '.join(jieba.cut(all_comments))
    # Set the font path here (a CJK-capable font is required)
    Words_Cloud = WordCloud(font_path="simkai.ttf").generate(words)
    Words_Cloud.to_file('Book_Blurb.jpg')


# Sentiment analysis of the short reviews
import numpy as np
from snownlp import SnowNLP

def emotion_analysis(url, cookies):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3298.4 Safari/537.36'
        }
    page_num = 0
    hot_comment_list = []
    while True:
        page_num += 1
        page_url = url + 'comments/hot?p={}'.format(page_num)
        res = requests.get(page_url, cookies=cookies, headers=headers)
        soup = bs4.BeautifulSoup(res.text, 'html.parser')
        # All short reviews on this page
        soup_list = soup.findAll('p', attrs={'class': 'comment-content'})
        for com in soup_list:
            comment = com.string.strip()
            hot_comment_list.append(comment)
        print('Page %d of reviews collected' % page_num)
        if page_num > 19:
            print('First 20 pages collected; starting sentiment analysis')
            break
    marks_list = []
    for com in hot_comment_list:
        mark = SnowNLP(com)
        marks_list.append(mark.sentiments)
    plt.hist(marks_list, bins=np.arange(0, 1, 0.02))
    plt.show()

# Regression analysis
from sklearn.linear_model import LinearRegression
boston_df = pd.read_excel('編程.xlsx', index_col=0)
boston_df.head()
# Draw the regression plot
sns.regplot(x=boston_df["評價人數"], y=boston_df["評分"])

 

 

 

V. Summary

  This scrape and analysis of Douban books gives a picture of the book categories people are looking for today, and a taste of the convenience that social development has brought.

  Whether on paper or on a screen: read more!

 

