(I) Background of the Topic
In this era of rapid technological development, online shopping is extremely popular, and shoppers can post reviews describing how good or bad a product is. The purpose of this topic is to use a web crawler to collect product review data from JD.com and analyze it. By crawling the review data of JD.com products and analyzing customers' objective evaluations, we can summarize the hot words in the reviews and identify what customers care about in a given category of product; by relating the scores customers give to the times at which the reviews were posted, we can judge whether the product's quality is reliable and whether the product is practical.
(II) Design Plan for the Topic-Oriented Web Crawler
1. Name of the crawler
Crawling review information for JD.com products
2. Content to crawl and data feature analysis
Content to crawl: the top ten keywords of the popular reviews for a JD.com product, the review scores, and the hot words.
Data features: web page text (the reviews themselves are returned as JSON wrapped in a JSONP callback).
3. Overview of the crawler design (implementation approach and technical difficulties)
Implementation approach: crawl the reviews posted under a product sold on JD.com, extract the keywords they contain, and analyze the relationship between the customers' scores and the times at which the scores were given. The main technical difficulty is that the reviews are loaded through JD's JSONP comment interface rather than being embedded in the product page itself, so the interface has to be requested directly and the callback wrapper removed before the JSON can be parsed.
The main Python libraries used are requests, json, csv, time, pandas, jieba, matplotlib.pyplot, numpy and wordcloud.
(III) Structural Analysis of the Target Pages
1. Page structure and feature analysis
2. HTML page parsing
The text of each review is wrapped in a tag beginning with <p class="comment-con">, and the time of each review is wrapped between <span> and </span>.
3. Node (tag) lookup and traversal methods
(Draw the node tree structure if necessary.)
Lookup method: find. Traversal method: for loop.
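As an illustration of the lookup and traversal described above, here is a minimal sketch (assuming BeautifulSoup from the bs4 package is installed; the HTML fragment is a made-up example that follows the structure described, not a real JD.com page):

from bs4 import BeautifulSoup

# a made-up fragment following the structure described above
html = '''
<div class="comment-item">
  <p class="comment-con">手機很好用,物流也很快</p>
  <span>2020-06-01 10:20</span>
</div>
'''

soup = BeautifulSoup(html, 'html.parser')
# find/find_all locates the tags, and a for loop traverses the matches
for p in soup.find_all('p', class_='comment-con'):
    print('review text:', p.get_text(strip=True))
for span in soup.find_all('span'):
    print('review time:', span.get_text(strip=True))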
(IV) Web Crawler Program Design
1. Data crawling and collection
import requests

def jd_crawl_comment(item_id, pagenum):
    list_ = []
    start_page = 1
    end_page = pagenum
    for p in range(start_page, end_page + 1):
        print(p)
        # productid = 3048505
        url = 'https://club.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv995&productId=' + str(item_id) + '&score=0&sortType=5&page={}&pageSize=10&isShadowSku=0&fold=1'
        url = url.format(p)
        print(url)
        # forge request headers so the request looks like it comes from a real browser
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
            'Referer': 'https://item.jd.com/1981570.html'
        }
        # send the request; the interface returns GBK-encoded JSONP
        content = requests.get(url=url, headers=headers).content.decode('gbk')
        # remove the JSONP callback wrapper to leave plain JSON
        content = content.strip('fetchJSON_comment98vv995();')
        print(content)
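Note that str.strip removes a set of characters from both ends rather than a literal prefix and suffix, so the last line above only works because the JSON body itself begins with '{' and ends with '}'. A slightly more robust sketch (using the standard re module; the callback name is the one appearing in the URL above) would cut the JSONP wrapper with a regular expression:

import re

def strip_jsonp(text):
    # keep only what sits between the callback's opening '(' and the closing ');'
    match = re.search(r'fetchJSON_comment98vv995\s*\((.*)\)\s*;?\s*$', text, re.S)
    return match.group(1) if match else text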
2. Data cleaning and processing
Saving the product review data to a .csv file:
import requests
import json
import csv
import time

def jd_crawl_comment(item_id, pagenum):
    list_ = []
    start_page = 1
    end_page = pagenum
    for p in range(start_page, end_page + 1):
        print(p)
        # productid = 3048505
        url = 'https://club.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv995&productId=' + str(item_id) + '&score=0&sortType=5&page={}&pageSize=10&isShadowSku=0&fold=1'
        url = url.format(p)
        print(url)
        # forge request headers so the request looks like it comes from a real browser
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
            'Referer': 'https://item.jd.com/1981570.html'
        }
        # send the request; the interface returns GBK-encoded JSONP
        content = requests.get(url=url, headers=headers).content.decode('gbk')
        # remove the JSONP callback wrapper to leave plain JSON
        content = content.strip('fetchJSON_comment98vv995();')
        print(content)
        obj = json.loads(content)

        comments = obj['comments']
        print(comments)
        fp = open('jingdong.txt', 'a', encoding='gbk')
        for comment in comments:
            name = comment['referenceName']
            id = comment['id']
            con = comment['content']
            creationTime = comment['creationTime']
            img_url = comment['userImageUrl']
            score = comment['score']
            item = {
                'name': name,
                'id': id,
                'con': con,
                'time': creationTime,
                'img_url': img_url,
            }
            string = str(item)
            print(id, con, score, creationTime)
            list_.append([id, con, score, creationTime])
            fp.write(string + '\n')
        fp.close()
        print('%s-page---finish' % p)
        time.sleep(5)   # pause between pages to reduce the risk of being blocked
    return list_

def write_to_csv(list_, header, outputfile):
    with open(outputfile, 'w', encoding='utf8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(header)
        for l in list_:
            writer.writerow(l)
        # the with-statement closes the file automatically
3. Text analysis
Generating a word cloud from the product review .csv data:
import pandas as pd
import jieba
from os import path
from wordcloud import WordCloud

def wordcloud_京東_商品評論(filename, output):
    # generate a word cloud from the review column of the CSV
    f = open(filename, encoding='utf8')
    data = pd.read_csv(f)

    string = ''
    comments = data['評論']
    for c in comments:
        string += str(c) + '\n'
    print(string)
    cut = " ".join(jieba.cut(string))
    # remove stopwords
    stopwords = ['其他', '很多', '不是', '非常', '這個', '那個', '真的', '可以', '沒有']
    for word in stopwords:
        cut = cut.replace(word, '')
    d = path.dirname(__file__)

    cloud = WordCloud(  # a Chinese font must be specified, otherwise the text is garbled
        scale=4,
        font_path=path.join(d, 'simhei.ttf'),
        background_color='white',
        max_words=2000,
        max_font_size=1550,
        min_font_size=2,
        collocations=False,
        relative_scaling=0.4,
        prefer_horizontal=1)
    wc = cloud.generate_from_text(cut)
    wc.to_file(output)
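A usage sketch, assuming the CSV produced in step 2 and a simhei.ttf font file sit next to the script (the output file name follows the main block in part 7):

wordcloud_京東_商品評論('京東商品評論數據.csv', '京東商品評論數據.csv詞雲.jpg')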
The result is as follows:
4. Data analysis and visualization (e.g., bar chart, histogram, scatter plot, box plot, distribution plot)
Processing the data to obtain the top ten keywords:
import pandas as pd
import jieba
import numpy as np
import matplotlib.pyplot as plt

def 詞頻_京東商品評論_柱狀圖(inputfile, topN):
    # horizontal bar chart of the topN most frequent words
    f = open(inputfile, encoding='utf8')
    data = pd.read_csv(f)

    string = ''
    comments = data['評論']
    for c in comments:
        string += str(c) + '\n'
    print(string)

    txt = string
    words = jieba.lcut(txt)
    dic_ = {}  # word -> frequency
    stopwords = ['其他', '很多', '不是', '非常', '這個', '那個', '真的', '可以', '沒有', '就是', '這里']

    for word in words:
        if len(word) == 1:
            continue
        else:
            rword = word
            dic_[rword] = dic_.get(rword, 0) + 1
    for word in stopwords:
        try:
            del(dic_[word])
        except KeyError:
            pass
    items = list(dic_.items())
    # sort by word frequency, descending
    items.sort(key=lambda x: x[1], reverse=True)

    labels = []
    sizes = []
    n = topN
    wordlist = list()
    for i in range(n):
        word, count = items[i]
        labels.append(word)
        sizes.append(count)
        wordlist.append(word)

    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False
    f, ax = plt.subplots()
    # horizontal bar chart
    rect = ax.barh(range(len(sizes)), sizes, tick_label='')
    plt.yticks(np.arange(len(labels)), labels)
    plt.legend((rect,), ("前十關鍵詞",))
    plt.savefig('前十關鍵詞.jpg')
    return labels, sizes
5. Based on the relationships in the data, analyze the correlation between the number of product reviews and time, and draw a scatter plot.
import pandas as pd
import matplotlib.pyplot as plt

def 評價_時間_分布散點圖(inputfile, output, title, columns):
    if inputfile.endswith('.xlsx') or inputfile.endswith('.xls'):
        data = pd.read_excel(inputfile)
    else:
        f = open(inputfile, encoding='utf8')
        data = pd.read_csv(f)
    timing = data['時間']
    dic_ = {}
    for t in timing:
        print(t)
        t = t.split(' ')[0]
        t = t.split('-')
        # extract the year-month part from the full timestamp
        month = t[0] + '-' + t[1]
        if month not in dic_.keys():
            dic_[month] = 1
        else:
            dic_[month] += 1
    plt.scatter(list(dic_.keys()), list(dic_.values()), alpha=0.8, marker='o')
    plt.rcParams['font.sans-serif'] = 'simhei'
    plt.title(title)
    plt.xlabel('月份')
    plt.xticks(rotation='vertical')
    plt.ylabel('評價數量')
    plt.savefig(output)
    # plt.show()
The result is as follows:
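The scatter plot above only shows the number of reviews per month; to quantify the correlation between review volume and time mentioned in point 5, a minimal sketch (the function name and the use of a simple month index are illustrative assumptions; it reads the same '時間' column of the CSV) could compute a Pearson coefficient with numpy:

import pandas as pd
import numpy as np

def 評價數量_時間_相關系數(inputfile):
    # count reviews per 'YYYY-MM' month, then correlate the counts with the month order
    data = pd.read_csv(inputfile, encoding='utf8')
    counts = data['時間'].astype(str).str.slice(0, 7).value_counts().sort_index()
    month_index = np.arange(len(counts))          # 0, 1, 2, ... in chronological order
    r = np.corrcoef(month_index, counts.values)[0, 1]
    print('Pearson correlation between month and review count: %.3f' % r)
    return r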
6. Data persistence
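In the program above, persistence is handled by write_to_csv (the 京東商品評論數據.csv file) and by appending raw records to jingdong.txt. As an alternative sketch (the function, database file and table names are illustrative assumptions), the list returned by jd_crawl_comment could also be stored in a SQLite database using the standard sqlite3 module:

import sqlite3

def save_to_sqlite(list_, dbfile='jd_comments.db'):
    # list_ holds [id, comment text, score, creation time] rows from jd_crawl_comment
    conn = sqlite3.connect(dbfile)
    conn.execute('CREATE TABLE IF NOT EXISTS comments '
                 '(id INTEGER PRIMARY KEY, content TEXT, score INTEGER, creation_time TEXT)')
    conn.executemany('INSERT OR REPLACE INTO comments VALUES (?, ?, ?, ?)', list_)
    conn.commit()
    conn.close()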
7. Complete program code, combining all of the parts above
import requests
import json
import csv
import time

def jd_crawl_comment(item_id, pagenum):
    list_ = []
    start_page = 1
    end_page = pagenum
    for p in range(start_page, end_page + 1):
        print(p)
        # productid = 3048505
        url = 'https://club.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv995&productId=' + str(item_id) + '&score=0&sortType=5&page={}&pageSize=10&isShadowSku=0&fold=1'
        url = url.format(p)
        print(url)
        # forge request headers so the request looks like it comes from a real browser
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
            'Referer': 'https://item.jd.com/1981570.html'
        }
        # send the request; the interface returns GBK-encoded JSONP
        content = requests.get(url=url, headers=headers).content.decode('gbk')
        # remove the JSONP callback wrapper to leave plain JSON
        content = content.strip('fetchJSON_comment98vv995();')
        print(content)
        obj = json.loads(content)

        comments = obj['comments']
        print(comments)
        fp = open('jingdong.txt', 'a', encoding='gbk')
        for comment in comments:
            name = comment['referenceName']
            id = comment['id']
            con = comment['content']
            creationTime = comment['creationTime']
            img_url = comment['userImageUrl']
            score = comment['score']
            item = {
                'name': name,
                'id': id,
                'con': con,
                'time': creationTime,
                'img_url': img_url,
            }
            string = str(item)
            print(id, con, score, creationTime)
            list_.append([id, con, score, creationTime])
            fp.write(string + '\n')
        fp.close()
        print('%s-page---finish' % p)
        time.sleep(5)   # pause between pages to reduce the risk of being blocked
    return list_

def write_to_csv(list_, header, outputfile):
    with open(outputfile, 'w', encoding='utf8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(header)
        for l in list_:
            writer.writerow(l)
        # the with-statement closes the file automatically

if __name__ == "__main__":
    list_ = jd_crawl_comment(100009464799, pagenum=20)
    outputfile = '京東商品評論數據.csv'
    header = ['id', '評論', '打分', '時間']
    write_to_csv(list_, header, outputfile)

# ---- analysis and visualization ----
import pandas as pd
from os import path
import jieba
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud

def wordcloud_京東_商品評論(filename, output):
    # generate a word cloud from the review column of the CSV
    f = open(filename, encoding='utf8')
    data = pd.read_csv(f)

    string = ''
    comments = data['評論']
    for c in comments:
        string += str(c) + '\n'
    print(string)
    cut = " ".join(jieba.cut(string))
    # remove stopwords
    stopwords = ['其他', '很多', '不是', '非常', '這個', '那個', '真的', '可以', '沒有']
    for word in stopwords:
        cut = cut.replace(word, '')
    d = path.dirname(__file__)

    cloud = WordCloud(  # a Chinese font must be specified, otherwise the text is garbled
        scale=4,
        font_path=path.join(d, 'simhei.ttf'),
        background_color='white',
        max_words=2000,
        max_font_size=1550,
        min_font_size=2,
        collocations=False,
        relative_scaling=0.4,
        prefer_horizontal=1)
    wc = cloud.generate_from_text(cut)
    wc.to_file(output)

def 詞頻_京東商品評論_柱狀圖(inputfile, topN):
    # horizontal bar chart of the topN most frequent words
    f = open(inputfile, encoding='utf8')
    data = pd.read_csv(f)

    string = ''
    comments = data['評論']
    for c in comments:
        string += str(c) + '\n'
    print(string)

    txt = string
    words = jieba.lcut(txt)
    dic_ = {}  # word -> frequency
    stopwords = ['其他', '很多', '不是', '非常', '這個', '那個', '真的', '可以', '沒有', '就是', '這里']

    for word in words:
        if len(word) == 1:
            continue
        else:
            rword = word
            dic_[rword] = dic_.get(rword, 0) + 1
    for word in stopwords:
        try:
            del(dic_[word])
        except KeyError:
            pass
    items = list(dic_.items())
    # sort by word frequency, descending
    items.sort(key=lambda x: x[1], reverse=True)

    labels = []
    sizes = []
    n = topN
    wordlist = list()
    for i in range(n):
        word, count = items[i]
        labels.append(word)
        sizes.append(count)
        wordlist.append(word)

    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False
    f, ax = plt.subplots()
    # horizontal bar chart
    rect = ax.barh(range(len(sizes)), sizes, tick_label='')
    plt.yticks(np.arange(len(labels)), labels)
    plt.legend((rect,), ("前十關鍵詞",))
    plt.savefig('前十關鍵詞.jpg')
    return labels, sizes

def 評價_時間_分布散點圖(inputfile, output, title, columns):
    if inputfile.endswith('.xlsx') or inputfile.endswith('.xls'):
        data = pd.read_excel(inputfile)
    else:
        f = open(inputfile, encoding='utf8')
        data = pd.read_csv(f)
    timing = data['時間']
    dic_ = {}
    for t in timing:
        print(t)
        t = t.split(' ')[0]
        t = t.split('-')
        # extract the year-month part from the full timestamp
        month = t[0] + '-' + t[1]
        if month not in dic_.keys():
            dic_[month] = 1
        else:
            dic_[month] += 1
    plt.scatter(list(dic_.keys()), list(dic_.values()), alpha=0.8, marker='o')
    plt.rcParams['font.sans-serif'] = 'simhei'
    plt.title(title)
    plt.xlabel('月份')
    plt.xticks(rotation='vertical')
    plt.ylabel('評價數量')
    plt.savefig(output)
    # plt.show()

if __name__ == '__main__':
    filename = '京東商品評論數據.csv'
    評價_時間_分布散點圖(filename, '打分分布散點圖(月份)', '打分分布散點圖(月份)', [])
    wordcloud_京東_商品評論(filename, filename + '詞雲.jpg')
    詞頻_京東商品評論_柱狀圖(filename, 10)
(V) Summary
1. What conclusions can be drawn from the analysis and visualization of the topic data? Were the expected goals achieved?
2. What was gained from completing this design, and what improvements could be suggested?