I. Topic Background
Nowadays music accompanies our daily lives and has become part of our routine. We listen to it in many situations: while doing homework, on the commute to work, during a short midday nap, and so on. Because of this, most of us take an interest in newly released hits and in songs whose popularity is climbing quickly, which is why I chose this topic for the project.
II. Crawler Design
Name: scraping data from the QQ Music Popularity chart (流行榜) and Surge chart (飆升榜)
Content: visit the QQ Music web site, crawl the information for the corresponding charts, and finally save it for visualization and analysis.
Design approach:
First, use requests to fetch the page.
Next, use lxml's etree and XPath to extract the page content.
Finally, save the data with basic file operations. A minimal sketch of these three steps follows.
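The sketch below is only an illustration of the pipeline, reusing the Surge-chart URL and a song-title XPath from Section III; the output file name demo.csv is a placeholder, and the complete program is given in Section IV.

import requests
from lxml import etree

# Step 1: fetch the chart page
res = requests.get('https://y.qq.com/n/ryqq/toplist/62',
                   headers={'User-Agent': 'Mozilla/5.0'})
res.encoding = 'utf-8'

# Step 2: parse the HTML and pull out text nodes with XPath
html = etree.HTML(res.text)
names = html.xpath("//*[@id='app']/div/div[2]/div[2]/div[3]/ul[2]/li/div/div[3]/span/a[2]/text()")

# Step 3: save the extracted values with plain file operations
with open('demo.csv', 'w', encoding='utf-8') as f:
    for name in names:
        f.write(name + '\n')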
Technical difficulty: each chart has to be crawled separately, so the amount of work is relatively large.
III. Page Structure Analysis
Page analysis:
Content-navigation style page.
Crawl target feature analysis (HTML analysis):
Target fields in the chart list markup: ranking, popularity/surge index, song duration, song title, and artist.
Node lookup method:
QQ_muc_pop = html.xpath("//*[@id='app']/div/div[2]/div[2]/div[3]/ul[2]/li[{}]/div/div[1]/text()".format(pop))
QQ_muc_up = html.xpath("//*[@id='app']/div/div[2]/div[2]/div[3]/ul[2]/li[{}]/div/div[2]/text()".format(pop))
QQ_muc_name = html.xpath("//*[@id='app']/div/div[2]/div[2]/div[3]/ul[2]/li[{}]/div/div[3]/span/a[2]/text()".format(pop))
QQ_muc_singer = html.xpath("//*[@id='app']/div/div[2]/div[2]/div[3]/ul[2]/li[{}]/div/div[4]/a/text()".format(pop))
QQ_muc_time = html.xpath("//*[@id='app']/div/div[2]/div[2]/div[3]/ul[2]/li[{}]/div/div[5]/text()".format(pop))
Traversal method: a for loop advances the li index in the XPath so that each chart entry is visited and its fields are filtered out in turn, as sketched below.
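A minimal sketch of this traversal, assuming html is the parsed page from the fetch step above; the full program in Section IV applies the same loop to all five fields.

# Visit the 20 chart entries by substituting the li index into the XPath
for pop in range(1, 21):
    name_nodes = html.xpath(
        "//*[@id='app']/div/div[2]/div[2]/div[3]/ul[2]/li[{}]/div/div[3]/span/a[2]/text()".format(pop))
    if name_nodes:          # keep the entry only when the node exists
        print(pop, name_nodes[0])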
IV. Crawler Program Design
Data crawling and collection:
import random

import requests
from lxml import etree

# Pool of browser User-Agent strings; one is chosen at random per run
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.16 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1623.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.17 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.2 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1468.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1467.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1464.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1500.55 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; rv:27.3) Gecko/20130101 Firefox/27.3',
    'Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:27.0) Gecko/20121011 Firefox/27.0',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/25.0',
    'Mozilla/5.0 (Windows NT 6.0; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0',
    'Mozilla/5.0 (Windows NT 6.2; rv:22.0) Gecko/20130405 Firefox/23.0',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20130406 Firefox/23.0',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:23.0) Gecko/20131011 Firefox/23.0',
    'Mozilla/5.0 (Windows NT 6.2; rv:22.0) Gecko/20130405 Firefox/22.0',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:22.0) Gecko/20130328 Firefox/22.0',
    'Mozilla/5.0 (Windows NT 6.1; rv:22.0) Gecko/20130405 Firefox/22.0',
    'Mozilla/5.0 (Microsoft Windows NT 6.2.9200.0); rv:22.0) Gecko/20130405 Firefox/22.0',
    'Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:16.0.1) Gecko/20121011 Firefox/21.0.1',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:16.0.1) Gecko/20121011 Firefox/21.0.1',
    'Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:21.0.0) Gecko/20121011 Firefox/21.0.0',
    'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:21.0) Gecko/20130514 Firefox/21.0',
]

headers = {
    'User-Agent': random.choice(USER_AGENTS),
    # 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0',
    'Connection': 'keep-alive',
    'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2'
}

# QQ Music Surge chart
def QQ_muc_up():
    # Request the chart page
    url = 'https://y.qq.com/n/ryqq/toplist/62'
    res = requests.get(url, headers=headers)
    res.encoding = 'utf-8'
    html = etree.HTML(res.text)
    # Create the output file and write the CSV header row
    file = open("QQ_muc_up.csv", "a", encoding='utf-8')
    file.write("QQ_muc_pop" + "," + "QQ_muc_up" + "," + "QQ_muc_name" + "," + "QQ_muc_singer" + "," + "QQ_muc_time" + '\n')
    file.close()
    # Fields: ranking QQ_muc_pop, surge index QQ_muc_up, song title QQ_muc_name,
    # artist QQ_muc_singer, song duration QQ_muc_time
    for pop in range(1, 21):
        QQ_muc_pop = html.xpath("//*[@id='app']/div/div[2]/div[2]/div[3]/ul[2]/li[{}]/div/div[1]/text()".format(pop))
        for item in QQ_muc_pop:
            QQ_muc_pop = item
        QQ_muc_up = html.xpath("//*[@id='app']/div/div[2]/div[2]/div[3]/ul[2]/li[{}]/div/div[2]/text()".format(pop))
        for item in QQ_muc_up:
            QQ_muc_up = int(item.strip('%'))
        QQ_muc_name = html.xpath("//*[@id='app']/div/div[2]/div[2]/div[3]/ul[2]/li[{}]/div/div[3]/span/a[2]/text()".format(pop))
        for item in QQ_muc_name:
            QQ_muc_name = item
        QQ_muc_singer = html.xpath("//*[@id='app']/div/div[2]/div[2]/div[3]/ul[2]/li[{}]/div/div[4]/a/text()".format(pop))
        for item in QQ_muc_singer:
            QQ_muc_singer = item
        QQ_muc_time = html.xpath("//*[@id='app']/div/div[2]/div[2]/div[3]/ul[2]/li[{}]/div/div[5]/text()".format(pop))
        for item in QQ_muc_time:
            QQ_muc_time = item
        # Append one row of data
        with open('QQ_muc_up.csv', "a", encoding='utf-8') as file1:
            file1.writelines(QQ_muc_pop + "," + str(QQ_muc_up) + "," + QQ_muc_name + "," + QQ_muc_singer + "," + QQ_muc_time + '\n')
        print('歌名:', QQ_muc_name, '\n', '排名:', QQ_muc_pop, '\n', '飆升指數:', QQ_muc_up, '\n', '歌手名:', QQ_muc_singer, '\n', '時長', QQ_muc_time)

# QQ Music Popularity chart
def QQ_muc_fasion():
    # Request the chart page
    url = 'https://y.qq.com/n/ryqq/toplist/4'
    res = requests.get(url, headers=headers)
    res.encoding = 'utf-8'
    html = etree.HTML(res.text)
    # Create the output file and write the CSV header row
    file = open("QQ_muc_fasion.csv", "a", encoding='utf-8')
    file.write("QQ_muc_pop" + "," + "QQ_muc_up" + "," + "QQ_muc_name" + "," + "QQ_muc_singer" + "," + "QQ_muc_time" + '\n')
    file.close()
    # Fields: ranking QQ_muc_pop, popularity index QQ_muc_up, song title QQ_muc_name,
    # artist QQ_muc_singer, song duration QQ_muc_time
    for pop in range(1, 21):
        QQ_muc_pop = html.xpath("//*[@id='app']/div/div[2]/div[2]/div[3]/ul[2]/li[{}]/div/div[1]/text()".format(pop))
        for item in QQ_muc_pop:
            QQ_muc_pop = item
        QQ_muc_up = html.xpath("//*[@id='app']/div/div[2]/div[2]/div[3]/ul[2]/li[{}]/div/div[2]/text()".format(pop))
        for item in QQ_muc_up:
            QQ_muc_up = int(item.strip('%'))
        QQ_muc_name = html.xpath("//*[@id='app']/div/div[2]/div[2]/div[3]/ul[2]/li[{}]/div/div[3]/span/a[2]/text()".format(pop))
        for item in QQ_muc_name:
            QQ_muc_name = item
        QQ_muc_singer = html.xpath("//*[@id='app']/div/div[2]/div[2]/div[3]/ul[2]/li[{}]/div/div[4]/a/text()".format(pop))
        for item in QQ_muc_singer:
            QQ_muc_singer = item
        QQ_muc_time = html.xpath("//*[@id='app']/div/div[2]/div[2]/div[3]/ul[2]/li[{}]/div/div[5]/text()".format(pop))
        for item in QQ_muc_time:
            QQ_muc_time = item
        # Append one row of data
        with open('QQ_muc_fasion.csv', "a", encoding='utf-8') as file1:
            file1.writelines(QQ_muc_pop + "," + str(QQ_muc_up) + "," + QQ_muc_name + "," + QQ_muc_singer + "," + QQ_muc_time + '\n')
        print('歌名:', QQ_muc_name, '\n', '排名:', QQ_muc_pop, '\n', '飆升指數:', QQ_muc_up, '\n', '歌手名:', QQ_muc_singer, '\n', '時長', QQ_muc_time)

if __name__ == '__main__':
    print('-------------------start----------------------')
    print('正在爬取QQ音樂飆升榜單')
    QQ_muc_up()
    print('-------------------分界線----------------------')
    print('正在爬取QQ音樂流行榜單')
    QQ_muc_fasion()
    print('--------------------end------------------------')
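One limitation of the string concatenation above is that a song title or artist name containing a comma would shift the CSV columns. A possible alternative (not part of the original program) is to let the standard csv module quote each field; save_row below is a hypothetical helper.

import csv

# Hypothetical helper: append one chart entry as a properly quoted CSV row
def save_row(path, row):
    with open(path, 'a', newline='', encoding='utf-8') as f:
        csv.writer(f).writerow(row)

# For example, inside the crawl loop:
# save_row('QQ_muc_up.csv', [QQ_muc_pop, QQ_muc_up, QQ_muc_name, QQ_muc_singer, QQ_muc_time])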
Data cleaning:
import pandas as pd
import numpy as np

# Load the two CSV files produced by the crawler
# (newer pandas versions replace error_bad_lines=False with on_bad_lines='skip')
Fasion = pd.read_csv(r'D:\HW\QQ_muc_fasion.csv', error_bad_lines=False)
Up = pd.read_csv(r'D:\HW\QQ_muc_up.csv', error_bad_lines=False)
Fasion
# Remove duplicate rows
Fasion = Fasion.drop_duplicates()
Up = Up.drop_duplicates()
# Drop rows containing NaN values
Fasion = Fasion.dropna(axis=0)
Up = Up.dropna(axis=0)
# Drop the duration column, which is not used in the analysis
del Up['QQ_muc_time']
del Fasion['QQ_muc_time']
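As a quick sanity check on the cleaning step (an illustrative addition, not part of the original notebook):

# Confirm that no duplicates or missing values remain, and that the row counts look right
print(Fasion.duplicated().sum(), Fasion.isnull().sum().sum())
print(Up.duplicated().sum(), Up.isnull().sum().sum())
print(Fasion.shape, Up.shape)   # at most 20 rows per chart are expected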
import matplotlib.pyplot as plt

# Visualization and analysis
# y values: click counts in units of ten thousand
x = Fasion['QQ_muc_name']        # song names, Popularity chart
y = Fasion['QQ_muc_up']          # popularity values, Popularity chart
x_up = Up['QQ_muc_name']         # song names, Surge chart (added so the Surge plots use their own labels)
z = Up['QQ_muc_up']              # surge values, Surge chart
plt.rcParams['font.sans-serif'] = ['SimHei']   # display Chinese labels correctly
plt.rcParams['axes.unicode_minus'] = False
plt.plot(x, y, '-', color='r', label="熱度")
plt.xticks(rotation=90)
plt.legend(loc="best")           # legend
plt.title("QQ音樂流行榜單趨勢圖")
plt.xlabel("歌曲名")              # x-axis label
plt.ylabel("熱度")                # y-axis label
plt.show()
plt.rcParams['font.sans-serif'] = ['SimHei']   # display Chinese labels correctly
plt.rcParams['axes.unicode_minus'] = False
plt.plot(x_up, z, '-', color='b', label="熱度")   # Surge chart plotted against its own song names
plt.xticks(rotation=90)
plt.legend(loc="best")           # legend
plt.title("QQ音樂飆升榜趨勢圖")
plt.xlabel("歌曲名")              # x-axis label
plt.ylabel("熱度")                # y-axis label
plt.show()
# Bar chart
plt.bar(x, y, alpha=0.2, width=0.4, color='b', lw=3)
plt.rcParams['font.sans-serif'] = ['SimHei']   # display Chinese labels correctly
plt.title("QQ音樂流行榜單柱狀圖")
plt.xticks(rotation=90)
plt.xlabel("歌曲名")              # x-axis label
plt.ylabel("熱度")                # y-axis label
plt.show()
# Bar chart
plt.bar(x_up, z, alpha=0.2, width=0.4, color='g', lw=3)
plt.rcParams['font.sans-serif'] = ['SimHei']   # display Chinese labels correctly
plt.title("QQ音樂飆升榜單柱狀圖")
plt.xticks(rotation=90)
plt.xlabel("歌曲名")              # x-axis label
plt.ylabel("熱度")                # y-axis label
plt.show()
# Horizontal bar chart
plt.barh(x, y, alpha=0.2, height=0.4, color='y', label="熱度指數", lw=3)
plt.title("QQ音樂流行榜單水平圖")
plt.legend(loc="best")           # legend
plt.xlabel("熱度")                # x-axis label
plt.ylabel("歌曲名")              # y-axis label
plt.show()
# Horizontal bar chart
plt.barh(x_up, z, alpha=0.2, height=0.4, color='pink', label="熱度指數", lw=3)
plt.title("QQ音樂飆升榜單水平圖")
plt.legend(loc="best")           # legend
plt.xlabel("熱度")                # x-axis label
plt.ylabel("歌曲名")              # y-axis label
plt.show()
# Scatter plot
plt.scatter(x, y, color='pink', marker='o', s=40, edgecolor='black', alpha=0.5)
plt.xticks(rotation=90)
plt.title("QQ音樂流行榜單散點圖")
plt.xlabel("歌曲名")              # x-axis label
plt.ylabel("熱度")                # y-axis label
plt.show()
# Scatter plot
plt.scatter(x_up, z, color='gray', marker='o', s=40, edgecolor='black', alpha=0.5)
plt.xticks(rotation=90)
plt.title("QQ音樂飆升榜單散點圖")
plt.xlabel("歌曲名")              # x-axis label
plt.ylabel("熱度")                # y-axis label
plt.show()
# Box plot
plt.boxplot(z)
plt.title("QQ音樂飆升榜單量盒圖")
plt.show()
# Box plot
plt.boxplot(y)
plt.title("QQ音樂流行榜單量盒圖")
plt.show()
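The two series could also be placed side by side in a single figure for easier comparison; below is a minimal sketch reusing the x, y, x_up and z variables defined above (an optional addition, not part of the original analysis).

# Draw the Popularity and Surge series next to each other
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
axes[0].plot(x, y, color='r')
axes[0].set_title("流行榜")
axes[0].tick_params(axis='x', labelrotation=90)
axes[1].plot(x_up, z, color='b')
axes[1].set_title("飆升榜")
axes[1].tick_params(axis='x', labelrotation=90)
plt.tight_layout()
plt.show()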
Word cloud:
import pandas as pd
import numpy as np
import wordcloud as wc
from PIL import Image
import matplotlib.pyplot as plt

# Use the QQ logo image as the mask that shapes the word cloud
bk = np.array(Image.open("QQ.jpg"))
mask = bk
Fasion = pd.read_csv(r'D:\HW\QQ_muc_fasion.csv', error_bad_lines=False)
Up = pd.read_csv(r'D:\HW\QQ_muc_up.csv', error_bad_lines=False)
word_cloud = wc.WordCloud(
    width=1000,                  # width of the word-cloud image
    height=1000,                 # height of the word-cloud image
    mask=mask,
    background_color='white',    # background colour, white by default
    font_path='msyhbd.ttc',      # font (a local Chinese font is needed for Chinese text)
    max_font_size=400,           # maximum font size, default 200
    random_state=50,             # random seed for word placement and colours
)
# Word cloud of artists on the Popularity chart
text = Fasion['QQ_muc_singer']
Fasion = []
for i in text:
    Fasion.append(i)
text = " ".join(Fasion)
word_cloud.generate(text)
plt.imshow(word_cloud)
plt.show()
# Word cloud of artists on the Surge chart
text = Up['QQ_muc_singer']
Up = []
for i in text:
    Up.append(i)
text = " ".join(Up)
word_cloud.generate(text)
plt.imshow(word_cloud)
plt.show()
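If the generated clouds also need to be kept as image files rather than only shown on screen, WordCloud can write them out directly; the file name below is only a placeholder.

# Save the most recently generated cloud to disk
word_cloud.to_file("QQ_muc_singer_cloud.png")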
V. Summary
1. What conclusions can be drawn from the analysis and visualization of the topic data? Were the expected goals met?
The analysis and visualization show that the popularity values give a clear picture of how the latest charts are ranked. The expected goal was met.
2. What was gained from completing this design, and what could be improved?
During the design process I learned how to write a crawler, how to extract the desired content from a web page, and how to draw a word cloud. The main area for improvement is my limited coding experience, which made writing the code fairly difficult. I hope to spend more time and effort on this weakness in my future work and further study.