一、選題背景
現如今,我們大多通過網上購物來滿足日常剛需,尤其是80後、90後、00後,日常生活已經離不開這種方式。但是在買家電的時候,我們通常會去蘇寧易購、京東這些平台購買,自己卻不清楚價格行情怎麼樣。我在上網瀏覽的時候發現,“什么值得買”這個平台匯集了所有購物平台的價格行情,所以選擇此課題進行數據可視化分析。
二、網絡爬蟲設計方案
名稱:什么值得買大家電數據爬蟲
內容:通過requests請求頁面、lxml解析頁面,爬取各類大家電的數據。
設計方案描述:
1、請求:
requests請求
etree(lxml)解析
2、爬取數據
html.xpath(lxml的etree)爬取內容
3、數據保存
使用內置的open()進行文件操作,將數據寫入CSV文件
難點:爬取的內容不乾淨,還需要做字符處理。
三、結構特征分析
內容導航型
商品名稱:
價格:
商品介紹:
購買平台:
節點查找:
# Node lookup: each product is the coun-th <li> under #feed-main-list, and
# every field lives below that item's second inner <div>.
# Product title.
name = html.xpath(
    "//*[@id='feed-main-list']/li[{}]/div/div[2]/h5/a/text()".format(coun)
)
# Price text.
price = html.xpath(
    "//*[@id='feed-main-list']/li[{}]/div/div[2]/div[1]/a/text()".format(coun)
)
# Selling platform.
platform = html.xpath(
    "//*[@id='feed-main-list']/li[{}]/div/div[2]/div[3]/div[2]/span/a/text()".format(coun)
)
# First text node of the description.
info = html.xpath(
    "//*[@id='feed-main-list']/li[{}]/div/div[2]/div[2]/text()[1]".format(coun)
)
遍歷:for循環遍歷出來
四、程序設計
數據爬取:
import csv
import os
import random
import re
import sys
import time

import requests
from bs4 import BeautifulSoup
from lxml import etree
from tqdm import tqdm


# Pool of User-Agent strings; one is picked at random for every request.
USER_AGENTS = [
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
    "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
    "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
    "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
    "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
    "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
    "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER",
]
# Base request headers; the User-Agent is re-drawn for every request below.
headers = {
    'User-Agent': random.choice(USER_AGENTS),
    'Connection': 'keep-alive',
    'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
}

CSV_PATH = "Jiadian.csv"
# Column order used for BOTH the header row and every data row.
CSV_COLUMNS = ["name", "price", "info", "platform"]
ITEMS_PER_PAGE = 30

# Character sets removed from both ends of the raw price text, applied in
# this order.  NOTE: str.strip() removes any of the given *characters*, not
# the literal substring.
_PRICE_STRIP_SETS = (
    '(需用券)',
    '元包郵 (需用券)',
    '元包郵(雙重優惠',
    '元包郵(拍下立減',
    '元',
    '元(包郵、',
)


def _ensure_csv_header():
    """Write the header row only when Jiadian.csv is missing or empty."""
    if os.path.exists(CSV_PATH) and os.path.getsize(CSV_PATH) > 0:
        return
    with open(CSV_PATH, "a", encoding="utf-8", newline="") as f:
        csv.writer(f, lineterminator="\n").writerow(CSV_COLUMNS)


def _last_text(html, expr):
    """Return the last text node matched by ``expr`` (stripped), or None."""
    matches = html.xpath(expr)
    return matches[-1].strip() if matches else None


def _clean_price(raw):
    """Strip the known coupon/shipping decorations from a raw price text."""
    price = raw.strip()
    for chars in _PRICE_STRIP_SETS:
        price = price.strip(chars)
    return price


# Create Jiadian.csv (header is written once, not on every run).
_ensure_csv_header()


def jiadian(page):
    """Crawl list pages 1..``page`` of the large-appliance category.

    For every product on a page the name, price, description and selling
    platform are extracted, printed, and appended to Jiadian.csv in the
    column order given by ``CSV_COLUMNS``.  Items missing any of the four
    fields are skipped.  Pages that cannot be fetched or parsed are
    reported on stderr and skipped.

    Args:
        page: number of list pages to crawl, starting from page 1.
    """
    with open(CSV_PATH, "a", encoding="utf-8", newline="") as f2:
        # csv.writer quotes fields that contain commas, so a comma inside a
        # product name no longer shifts the columns.
        writer = csv.writer(f2, lineterminator="\n")
        for page_no in range(1, page + 1):
            url = 'https://www.smzdm.com/fenlei/dajiadian/p' + str(page_no) + '/#feed-main'
            request_headers = dict(headers)
            request_headers['User-Agent'] = random.choice(USER_AGENTS)
            try:
                res = requests.get(url, headers=request_headers, timeout=10)
                res.raise_for_status()
                res.encoding = 'utf-8'
                html = etree.HTML(res.text)
            except (requests.RequestException, etree.LxmlError) as err:
                print('page', page_no, 'skipped:', err, file=sys.stderr)
                time.sleep(1)
                continue
            if html is None:
                # Nothing parseable came back for this page.
                print('page', page_no, 'skipped: empty document', file=sys.stderr)
                time.sleep(1)
                continue

            # Appliance name, price, product description, selling platform.
            for coun in range(1, ITEMS_PER_PAGE + 1):
                name = _last_text(
                    html,
                    "//*[@id='feed-main-list']/li[{}]/div/div[2]/h5/a/text()".format(coun))
                raw_price = _last_text(
                    html,
                    "//*[@id='feed-main-list']/li[{}]/div/div[2]/div[1]/a/text()".format(coun))
                info = _last_text(
                    html,
                    "//*[@id='feed-main-list']/li[{}]/div/div[2]/div[2]/text()[1]".format(coun))
                platform = _last_text(
                    html,
                    "//*[@id='feed-main-list']/li[{}]/div/div[2]/div[3]/div[2]/span/a/text()".format(coun))
                if None in (name, raw_price, info, platform):
                    # Incomplete item (or fewer than 30 items on the page):
                    # skip it and move on to the next <li>.
                    continue
                price = _clean_price(raw_price)

                # Save the record to Jiadian.csv in header order.
                writer.writerow([name, price, info, platform])
                print(name, '\n', '價格:', price, '元', '\n', '簡介:', info, '\n', '購買平台:', platform, '\n')

            # Persist each finished page and pause between page requests.
            f2.flush()
            time.sleep(1)


if __name__ == '__main__':
    page = 100
    jiadian(page)
運行圖片:
數據清洗處理:
import pandas as pd
import numpy as np

# Load the crawled data.  Malformed rows (wrong number of fields) are
# skipped; `on_bad_lines='skip'` replaces `error_bad_lines=False`, which was
# deprecated in pandas 1.3 and removed in pandas 2.0.
JD = pd.read_csv(r'D:\Hxt\Jiadian.csv', on_bad_lines='skip')
JD.head(20)
# Duplicate handling: keep only the first row for each product name.
JD = JD.drop_duplicates(subset='name')
JD.head(20)
# Drop the invalid columns: every header-less column that pandas labelled
# "Unnamed: N".  Selecting them by prefix avoids a KeyError when the file
# has fewer (or more) of them than the seven that used to be hard-coded.
unnamed_columns = [col for col in JD.columns if str(col).startswith('Unnamed')]
JD = JD.drop(columns=unnamed_columns)
# NaN handling.
# Prices are scraped as text; coerce them to numbers first so that leftovers
# that are not a plain number become NaN and are removed together with the
# other incomplete rows (otherwise the plots treat prices as categories).
JD['price'] = pd.to_numeric(JD['price'], errors='coerce')
JD = JD.dropna(axis=0, how='any')
可視化分析:
import matplotlib.pyplot as plt

# Visual analysis: price trend of the first 20 products.
# x is the product name, y is the price.
x = JD['name'].head(20)
y = JD['price'].head(20)
plt.rcParams['font.sans-serif'] = ['SimHei']  # font that can render the Chinese labels
plt.rcParams['axes.unicode_minus'] = False  # keep the minus sign renderable with that font
# The series is the price, so the legend label says so (it used to read
# "clicks, unit 10k").
plt.plot(x, y, '-.', color='y', label="價格 單位/元")
plt.xticks(rotation=90)
plt.legend(loc="best")  # legend
plt.title("家電價格趨勢圖")
plt.xlabel("家電")  # x-axis label
plt.ylabel("價格")  # y-axis label
plt.show()
# Bar chart of the first 20 prices.
# color='' is not a valid matplotlib colour; use the same blue as the
# horizontal bar chart.
plt.bar(x, y, alpha=0.2, width=0.4, color='b', lw=3)
plt.rcParams['font.sans-serif'] = ['SimHei']  # font that can render the Chinese labels
plt.title("家電價格柱狀圖")
plt.xticks(rotation=90)
plt.xlabel("家電")  # x-axis label
plt.ylabel("價格")  # y-axis label
plt.show()
# Horizontal bar chart of the first 20 prices.
plt.barh(x, y, alpha=0.2, height=0.4, color='b', label="價格 單位/元", lw=3)
plt.title("家電價格水平圖")
plt.legend(loc="best")  # legend
# On a horizontal bar chart the bar length (x axis) is the price and the
# categories (y axis) are the appliances.
plt.xlabel("價格")  # x-axis label
plt.ylabel("家電")  # y-axis label
plt.show()
# Scatter plot of the first 20 prices.
plt.scatter(
    x,
    y,
    s=40,
    marker='o',
    color='pink',
    edgecolor='black',
    alpha=0.5,
)
plt.xticks(rotation=90)
plt.title("家電價格散點圖")
plt.xlabel("家電")  # x-axis label
plt.ylabel("價格")  # y-axis label
plt.show()
# Box plot of the first 20 prices.
plt.boxplot(x=y)
plt.title("家電價格量盒圖")
plt.show()
# Word cloud of the product descriptions.
import pandas as pd
import numpy as np
import wordcloud as wc
from PIL import Image
import matplotlib.pyplot as plt

# The picture is passed to WordCloud as the mask.
mask = np.array(Image.open("JD.jpg"))
# Re-read the raw file; malformed rows are skipped (`on_bad_lines='skip'`
# replaces `error_bad_lines=False`, removed in pandas 2.0).
cloud_source = pd.read_csv(r'D:\Hxt\Jiadian.csv', on_bad_lines='skip')
word_cloud = wc.WordCloud(
    width=1000,  # canvas width
    height=1000,  # canvas height
    mask=mask,
    background_color='white',  # background colour
    font_path='msyhbd.ttc',  # must be a font installed locally that covers Chinese
    max_font_size=400,  # largest font size
    random_state=50,  # fixed random seed
)
# Missing descriptions are NaN (float) and would make str.join raise a
# TypeError, so drop them and force everything to str before joining.
text = " ".join(cloud_source['info'].dropna().astype(str))
word_cloud.generate(text)
plt.imshow(word_cloud)
plt.show()
總代碼:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import wordcloud as wc
from PIL import Image

CSV_PATH = r'D:\Hxt\Jiadian.csv'

# ---- Data cleaning ------------------------------------------------------
# Malformed rows are skipped; `on_bad_lines='skip'` replaces
# `error_bad_lines=False`, which was removed in pandas 2.0.
JD = pd.read_csv(CSV_PATH, on_bad_lines='skip')
JD.head(20)
# Duplicate handling: keep the first row for each product name.
JD = JD.drop_duplicates('name')
JD.head(20)
# Drop the invalid header-less columns ("Unnamed: N") by prefix, so a file
# with a different number of them does not raise a KeyError.
unnamed_columns = [col for col in JD.columns if str(col).startswith('Unnamed')]
JD = JD.drop(columns=unnamed_columns)
# NaN handling: coerce prices to numbers first so non-numeric leftovers
# become NaN and are removed together with the other incomplete rows.
JD['price'] = pd.to_numeric(JD['price'], errors='coerce')
JD = JD.dropna(axis=0, how='any')

# ---- Visual analysis ----------------------------------------------------
# x is the product name, y is the price, for the first 20 products.
x = JD['name'].head(20)
y = JD['price'].head(20)
plt.rcParams['font.sans-serif'] = ['SimHei']  # font that can render the Chinese labels
plt.rcParams['axes.unicode_minus'] = False  # keep the minus sign renderable with that font

# Line chart.  The series is the price, so the legend label says so (it
# used to read "clicks, unit 10k").
plt.plot(x, y, '-.', color='y', label="價格 單位/元")
plt.xticks(rotation=90)
plt.legend(loc="best")  # legend
plt.title("家電價格趨勢圖")
plt.xlabel("家電")  # x-axis label
plt.ylabel("價格")  # y-axis label
plt.show()

# Bar chart.  color='' is not a valid matplotlib colour; use the same blue
# as the horizontal bar chart.
plt.bar(x, y, alpha=0.2, width=0.4, color='b', lw=3)
plt.title("家電價格柱狀圖")
plt.xticks(rotation=90)
plt.xlabel("家電")  # x-axis label
plt.ylabel("價格")  # y-axis label
plt.show()

# Horizontal bar chart: the x axis carries the price, the y axis the
# appliances, so the labels are the other way round here.
plt.barh(x, y, alpha=0.2, height=0.4, color='b', label="價格 單位/元", lw=3)
plt.title("家電價格水平圖")
plt.legend(loc="best")  # legend
plt.xlabel("價格")  # x-axis label
plt.ylabel("家電")  # y-axis label
plt.show()

# Scatter plot.
plt.scatter(x, y, color='pink', marker='o', s=40, edgecolor='black', alpha=0.5)
plt.xticks(rotation=90)
plt.title("家電價格散點圖")
plt.xlabel("家電")  # x-axis label
plt.ylabel("價格")  # y-axis label
plt.show()

# Box plot.
plt.boxplot(y)
plt.title("家電價格量盒圖")
plt.show()

# ---- Word cloud of the product descriptions -----------------------------
# The picture is passed to WordCloud as the mask.
mask = np.array(Image.open("JD.jpg"))
# The word cloud uses the raw file again rather than the cleaned frame.
cloud_source = pd.read_csv(CSV_PATH, on_bad_lines='skip')
word_cloud = wc.WordCloud(
    width=1000,  # canvas width
    height=1000,  # canvas height
    mask=mask,
    background_color='white',  # background colour
    font_path='msyhbd.ttc',  # must be a font installed locally that covers Chinese
    max_font_size=400,  # largest font size
    random_state=50,  # fixed random seed
)
# Missing descriptions are NaN (float) and would make str.join raise a
# TypeError, so drop them and force everything to str before joining.
text = " ".join(cloud_source['info'].dropna().astype(str))

word_cloud.generate(text)
plt.imshow(word_cloud)
plt.show()
五、總結
通過此次主題數據分析與可視化,可以看出在大家電中,電腦、烘乾機、大電視的價格比較貴。分析結果達到預期,可以很明顯地看出不同家電的價格差異。在此次設計過程中,我了解到原來我們常見的詞語海報是用詞雲圖製作的,我對此非常感興趣,還去查閱了很多相關資料。不足之處可能就是爬蟲部分:爬取內容時遇到了太多坎,花費了很多時間才獲取到數據。