抓取 Bilibili 彈幕數據並進行數據分析


視頻地址 https://www.bilibili.com/bangumi/play/ss39462?spm_id_from=333.851.b_62696c695f7265706f72745f616e696d65.52
彈幕地址:固定的 URL 前綴(https://comment.bilibili.com/)+ 視頻的 cid + .xml;cid 可在視頻頁面源碼中搜索「cid」取得
比如:https://comment.bilibili.com/428471132.xml  

數據獲取部分
# 完整代碼
#   獲取數據
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Scrape the danmaku (bullet comments) of one Bilibili video and save them,
# together with their metadata, to comments.csv.
cid = 428471132
url = "https://comment.bilibili.com/{}.xml".format(cid)

# Download the danmaku XML.  The timeout keeps the script from hanging forever
# on a stalled connection, and raise_for_status() aborts on an HTTP error
# instead of parsing an error page as if it were danmaku data.
response = requests.get(url, timeout=10)
response.raise_for_status()
response.encoding = "utf-8"

# Parse the document and collect every <d> element.
soup = BeautifulSoup(response.text, "lxml")
datas = soup.select('d')

# Text content of each danmaku.
comments = [data.text for data in datas]

# Column names for the comma-separated fields of the "p" attribute
# (appearance time, mode, font, colour, send time, danmaku pool, user ID,
# row ID, unknown parameter).
columns = ["出現時間點","模式","字體","顏色","發送時間","彈幕池","用戶ID","rowID","未知參數"]

# Split the "p" attribute of every element into its fields.  Each record is
# padded with None / trimmed to exactly len(columns) entries, because
# pd.DataFrame raises when a row's field count differs from the column count.
info_comments = [
    (data.get('p', '').split(',') + [None] * len(columns))[:len(columns)]
    for data in datas
]

# Build the table and attach the danmaku text as an extra column.
comment_datas = pd.DataFrame(info_comments, columns=columns)
comment_datas["comments"] = comments

# Persist the result; utf-8-sig writes a BOM at the start of the file.
comment_datas.to_csv("comments.csv", encoding="utf-8-sig")
print("finish...")
數據分析部分

一、繪製詞雲圖
 
         

  # 加載數據
  import pandas as pd
  comment_datas = pd.read_csv("comments.csv",encoding="utf-8-sig")
  print(comment_datas)

##  繪制詞雲圖
import jieba 
from tkinter import _flatten
import matplotlib.pyplot as plt 
from wordcloud import WordCloud 

# Build a word cloud from the danmaku text in comment_datas["comments"].

# Danmaku text.  Missing values are dropped and everything is coerced to str
# so that jieba.lcut always receives a string.
comments = comment_datas["comments"].dropna().astype(str)

# Tokenize every danmaku.
jieba.load_userdict("hong.txt")  # load the user-defined dictionary
comments_cut = comments.apply(jieba.lcut)

# Remove stop words.  The stop words are kept in a set so that each token is
# compared as a whole word; testing `token in <file text>` would be a
# substring match and would discard any token that merely occurs somewhere
# inside the file.
# NOTE(review): assumes stoplist.txt holds one stop word per line - confirm.
with open("stoplist.txt", "r", encoding="utf-8") as f:
    stop_words = {line.strip() for line in f}
stop_words.discard("")
# Whitespace-only tokens (spaces, newlines) are dropped as well.
comments_after = comments_cut.apply(
    lambda x: [i for i in x if i.strip() and i not in stop_words]
)

# Word frequencies: flatten the per-danmaku token lists and count each word.
results = [word for words in comments_after for word in words]
word_count = pd.Series(results).value_counts()

# Draw the word cloud.  https://tool.lu/cutout/
pic = plt.imread("aixin.jpg")  # image used as the word-cloud mask
# Raw string: the Windows path contains backslashes that must not be read as
# escape sequences.
word_cloud = WordCloud(mask=pic, background_color='white', font_path=r"C:\Windows\Fonts\simhei.ttf")
word_cloud.fit_words(word_count)
plt.imshow(word_cloud)
plt.axis('off')
plt.show()  # display the figure, as the other plots in this file do
二、分析彈幕數量與日期、時間的關係
#  分析彈幕數量與日期,時間的關系


#  加載數據
import pandas as pd
from datetime import datetime

# Per-user statistics: how many danmaku each user sent, and how many users
# share each per-user total.


def _as_local_time_text(epoch_seconds):
    """Format a Unix timestamp as a local 'YYYY-MM-DD HH:MM:SS' string."""
    return datetime.fromtimestamp(epoch_seconds).strftime('%Y-%m-%d %H:%M:%S')


# Load the table and replace the raw send timestamps by readable strings.
comment_datas = pd.read_csv("comments.csv", encoding="utf-8-sig")
comment_datas["發送時間"] = comment_datas["發送時間"].apply(_as_local_time_text)

# User ID of every danmaku.
userID = comment_datas["用戶ID"]

# Number of danmaku sent by each user.
userID_count = userID.value_counts()

# Number of users for each per-user danmaku total.
userID_count_count = userID_count.value_counts()

# Same table, ordered by the per-user danmaku total.
userID_count_count_sort = userID_count_count.sort_index()
print(userID_count_count_sort)

# Users that fall outside the first six rows of the sorted table.
num = userID_count_count_sort.iloc[6:]
print(num.sum())

##  繪制條形圖
import matplotlib.pyplot as plt 
# Bar chart of the first six rows of userID_count_count_sort: for each
# per-user danmaku total (x axis) the number of users with that total (y axis).
num = userID_count_count_sort.iloc[:6]
# One bar per available row, so the chart still works with fewer than six rows.
positions = range(len(num))
plt.style.use('ggplot')
plt.rcParams['font.sans-serif'] = 'SimHei'  # font able to render the Chinese labels
plt.bar(positions, num)
# Label each bar with the danmaku total it stands for instead of its position.
plt.xticks(positions, num.index)
plt.xlabel("彈幕數量")
plt.ylabel("用戶數量")
plt.title("彈幕發布數量分布圖")
plt.show()

##  彈幕數量隨時間變化圖
#  去除時分秒的影響
# Danmaku volume per calendar day: strip the time of day from every send
# time, then count how many danmaku fall on each date.
send_times = pd.to_datetime(comment_datas["發送時間"])
dates = pd.Series(send_times.dt.date.tolist())
num = dates.value_counts().sort_index()

# Line chart with one point and one tick label per date.
tick_positions = range(len(num))
plt.figure(figsize=(16, 9))
plt.plot(tick_positions, num)
plt.xticks(tick_positions, num.index, rotation=45)
plt.xlabel("日期變化")
plt.ylabel("彈幕數量")
plt.title("彈幕發布數量隨日期變化圖")
plt.show()


###  分析彈幕數量與日期,時間的關系 -- 以周為研究對象
import pandas as pd
# Danmaku volume per day of the week.
comment_datas = pd.read_csv("comments.csv", encoding="utf-8-sig")
# Replace the raw send timestamps by readable local-time strings.
comment_datas["發送時間"] = comment_datas["發送時間"].apply(lambda x: datetime.fromtimestamp(x).strftime('%Y-%m-%d %H:%M:%S'))

dates = pd.to_datetime(comment_datas["發送時間"])

# Tick labels, Sunday first.
week_labels = ["周日","周一","周二","周三","周四","周五","周六"]

# dt.weekday numbers Monday as 0 and Sunday as 6.  Shift by one so that
# Sunday becomes 0 and every count lines up with its label in week_labels.
date = (dates.dt.weekday + 1) % 7

# Count per weekday.  reindex() orders the result 0..6 and inserts a zero for
# weekdays without any danmaku, so there are always exactly seven points for
# the seven tick labels.
date_count = date.value_counts().reindex(list(range(len(week_labels))), fill_value=0)

plt.figure(figsize=(16,9))
plt.plot(range(len(date_count)), date_count)
plt.xticks(range(len(date_count)), week_labels, rotation=45)
plt.ylabel("彈幕數量")
plt.xlabel("日期變化")
plt.title("彈幕發布數量隨日期變化圖")
plt.show()
 
        

 

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯繫本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM