視頻地址:https://www.bilibili.com/bangumi/play/ss39462?spm_id_from=333.851.b_62696c695f7265706f72745f616e696d65.52
彈幕地址:固定的 URL 前綴 + 視頻的 cid + ".xml"(cid 可在網頁源碼中搜索 "cid" 得到),例如:https://comment.bilibili.com/428471132.xml
數據獲取部分
# Full script: download the danmaku (bullet comment) XML of one video and
# store it as a CSV table.
import requests
from bs4 import BeautifulSoup
import pandas as pd

# cid of the video; the danmaku URL is the fixed prefix + cid + ".xml".
cid = 428471132
url = "https://comment.bilibili.com/{}.xml".format(cid)

# Fetch the data.  The timeout keeps the script from hanging forever on a
# stalled connection, and raise_for_status() aborts on an HTTP error instead
# of silently parsing an error page as if it were danmaku data.
response = requests.get(url, timeout=10)
response.raise_for_status()
response.encoding = "utf-8"

# Parse the data: every danmaku is selected as one <d> element.
soup = BeautifulSoup(response.text, "lxml")
datas = soup.select('d')

# Text content of each danmaku.
comments = [data.text for data in datas]

# Attribute information: the "p" attribute is a comma-separated list whose
# fields are named by `columns` below (appearance time, mode, font, colour,
# send time, danmaku pool, user ID, rowID, unknown parameter).
info_comments = [data.get('p').split(',') for data in datas]

# Build the table from the attribute fields.
columns = ["出現時間點", "模式", "字體", "顏色", "發送時間", "彈幕池", "用戶ID", "rowID", "未知參數"]
comment_datas = pd.DataFrame(info_comments, columns=columns)

# Combine the attributes with the danmaku text.
comment_datas["comments"] = comments

# Store the result; utf-8-sig writes a BOM at the start of the file.
comment_datas.to_csv("comments.csv", encoding="utf-8-sig")
print("finish...")
數據分析部分
一、繪製詞雲圖
# Load the data
import pandas as pd

# Read the danmaku table back from the CSV file and print it for a quick
# visual check.
comment_datas = pd.read_csv(
    filepath_or_buffer="comments.csv",
    encoding="utf-8-sig",
)
print(comment_datas)
## Draw the word cloud
from itertools import chain

import jieba
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Danmaku texts.  Empty danmaku come back from the CSV as NaN, and jieba.lcut
# only accepts strings, so drop the missing values and cast the rest to str.
comments = comment_datas["comments"].dropna().astype(str)

# Word segmentation.
jieba.load_userdict("hong.txt")  # load the user-defined dictionary
comments_cut = comments.apply(jieba.lcut)  # segment every danmaku

# Remove stop words.  The stop list is kept as a set of whole words: testing
# membership against the raw file text would be a substring test and would
# drop any token that merely occurs inside some stop word.
with open("stoplist.txt", "r", encoding="utf-8") as f:
    stop_words = {line.strip() for line in f.read().splitlines()}
stop_words.update(("\n", "●"))
comments_after = comments_cut.apply(
    # `w.strip()` also discards whitespace-only tokens.
    lambda words: [w for w in words if w.strip() and w not in stop_words]
)

# Word frequency: flatten the per-danmaku token lists and count each word.
results = list(chain.from_iterable(comments_after))
word_count = pd.Series(results, dtype="object").value_counts()

# Draw the word cloud (the author's reference link for this step:
# https://tool.lu/cutout/).
pic = plt.imread("aixin.jpg")  # image used as the word-cloud mask
word_cloud = WordCloud(
    mask=pic,
    background_color='white',
    # Raw string: the backslashes are path separators, not escape sequences.
    font_path=r"C:\Windows\Fonts\simhei.ttf",
)
word_cloud.fit_words(word_count.to_dict())
plt.imshow(word_cloud)
plt.axis('off')
plt.show()
二、分析彈幕數量與日期、時間的關係
# Analyse how the number of danmaku relates to date and time.
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt

# Load the data and turn the send time (a POSIX timestamp) into a formatted
# local-time string.
comment_datas = pd.read_csv("comments.csv", encoding="utf-8-sig")
comment_datas["發送時間"] = comment_datas["發送時間"].apply(
    lambda x: datetime.fromtimestamp(x).strftime('%Y-%m-%d %H:%M:%S')
)

# --- Danmaku per user -------------------------------------------------------
userID = comment_datas["用戶ID"]
# How many danmaku each user sent.
userID_count = userID.value_counts()
# How many users share each send count.
userID_count_count = userID_count.value_counts()
# Same, ordered by send count.
userID_count_count_sort = userID_count_count.sort_index()
print(userID_count_count_sort)

# Number of users that fall outside the first six send counts.
num = userID_count_count_sort.iloc[6:]
print(num.sum())

## Bar chart of the first six send counts.
num = userID_count_count_sort.iloc[:6]
plt.style.use('ggplot')
plt.rcParams['font.sans-serif'] = 'SimHei'
# Size the x positions from the data (there may be fewer than six distinct
# send counts) and label each bar with its real send count rather than with
# the bare position 0..5.
positions = range(len(num))
plt.bar(positions, num)
plt.xticks(positions, num.index)
plt.xlabel("彈幕數量")
plt.ylabel("用戶數量")
plt.title("彈幕發布數量分布圖")
plt.show()

## Number of danmaku per calendar day.
send_times = pd.to_datetime(comment_datas["發送時間"])
# Keep only the date part so the hour/minute/second do not split the counts.
dates = pd.Series(send_times.dt.date)
num = dates.value_counts().sort_index()

# Line chart, one tick per day.
plt.figure(figsize=(16, 9))
plt.plot(range(len(num)), num)
plt.xticks(range(len(num)), num.index, rotation=45)
plt.ylabel("彈幕數量")
plt.xlabel("日期變化")
plt.title("彈幕發布數量隨日期變化圖")
plt.show()

### Number of danmaku per day of the week.
# The timestamps parsed above are reused; reloading and reconverting the CSV
# would produce exactly the same values.
dates = send_times
date = pd.Series(dates.dt.weekday)
# dt.weekday numbers Monday as 0 and Sunday as 6.  Reindex to all seven days
# so a weekday without any danmaku shows up as 0 and the seven tick labels
# always line up with seven points.
date_count = date.value_counts().reindex(range(7), fill_value=0)
plt.figure(figsize=(16, 9))
plt.plot(range(len(date_count)), date_count)
# Labels run Monday..Sunday to match the dt.weekday numbering.
plt.xticks(range(len(date_count)), ["周一", "周二", "周三", "周四", "周五", "周六", "周日"], rotation=45)
plt.ylabel("彈幕數量")
plt.xlabel("日期變化")
plt.title("彈幕發布數量隨日期變化圖")
plt.show()
