信息解讀
`1446.2345,1,25,16777215,1312863760,0,eff85771,42759017` 是彈幕 XML 中 `<d>` 標籤 `p` 屬性的值,由八個以逗號分隔的數據組成:
第一個參數 time 是彈幕出現的時間以秒數為單位。
第二個參數 mode 是彈幕的模式:1~3 滾動彈幕,4 底端彈幕,5 頂端彈幕,6 逆向彈幕,7 精準定位,8 高級彈幕。
第三個參數 size 是字號, 12非常小,16特小,18小,25中,36大,45很大,64特別大。
第四個參數 color 是字體的顏色,以 HTML 顏色值的十進制表示為準(例如 16777215 即 #FFFFFF,白色)。
第五個參數 timestamp 是 Unix 格式的時間戳,基準時間為 UTC 1970-01-01 00:00:00(即北京時間 1970-01-01 08:00:00)。
第六個參數 pool 是彈幕池 0普通池 1字幕池 2特殊池。
第七個參數 author 是發送者的ID,用於“屏蔽此彈幕的發送者”功能。
第八個參數 rowid 是彈幕在彈幕數據庫中rowID 用於“歷史彈幕”功能。
最後我們加一個 text 表示彈幕本身的內容。
實現
我們把每條彈幕的所有信息(連同視頻的 cid)一起整到一個字典對象中,最終輸出一個字典對象的列表。
單線程按搜索結果批量爬取
import json
import re
import requests
import json
def GetHTMLContent(url, timeout=10):
    """Fetch ``url`` with a desktop-browser User-Agent and return the body as text.

    Args:
        url: Address to request with HTTP GET.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled
            connection, which would freeze the whole crawl.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        requests.exceptions.RequestException: On connection failure or timeout.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    # Decode the raw bytes explicitly as UTF-8 instead of relying on the
    # charset requests would pick for ``response.text``.
    return response.content.decode("utf-8")
def SaveDanmuList(list, cid):
    """Parse danmaku XML text and write the entries to ``output_<cid>.json``.

    Every ``<d p="...">text</d>`` element becomes one dict holding the eight
    comma-separated ``p`` fields, the danmaku text and the video ``cid``.
    All values are kept as the strings found in the XML.

    Args:
        list: Raw danmaku XML text. (The name shadows the builtin; it is kept
            so existing callers are unaffected.)
        cid: Video cid as a string; stored in every entry and used in the
            output file name.
    """
    reDanmu = re.compile(
        r'<d p="(.*?),(.*?),(.*?),(.*?),(.*?),(.*?),(.*?),(.*?)">(.*?)</d>')
    # Order matches the capture groups of the pattern above.
    fieldNames = ("time", "mode", "size", "color", "timestamp",
                  "pool", "author", "rowid", "text")

    listDictDanmu = []
    for itemDanmu in reDanmu.findall(list):
        dictItemDanmu = dict(zip(fieldNames, itemDanmu))
        dictItemDanmu["cid"] = cid
        listDictDanmu.append(dictItemDanmu)

    # Serialize once, and let the context manager close the file even if
    # writing fails.
    with open("output_"+cid+".json", "w", encoding="utf-8") as fileOutput:
        fileOutput.write(json.dumps(listDictDanmu, ensure_ascii=False,
                                    sort_keys=True, indent=4,
                                    separators=(',', ':')))
def GetDanmuByCid(queryCid):
    """Download the danmaku XML for ``queryCid`` and save it as JSON.

    Args:
        queryCid: Video cid as a string; it is concatenated into the URL
            and passed on to ``SaveDanmuList``.
    """
    xmlText = GetHTMLContent('https://comment.bilibili.com/' + queryCid + '.xml')
    SaveDanmuList(xmlText, queryCid)
def GetCidByBid(queryBid):
    """Return the cid of the first ``data`` entry for a video id, as a string.

    Args:
        queryBid: Video id string sent as the ``bvid`` query parameter.

    Returns:
        ``str`` of the ``cid`` field of the first element in the response's
        ``data`` list. Later elements are ignored (presumably the other
        parts of a multi-part video - verify against the API).
    """
    pagelistUrl = ("https://api.bilibili.com/x/player/pagelist?bvid="
                   + queryBid + "&jsonp=jsonp")
    payload = json.loads(GetHTMLContent(pagelistUrl))
    firstPage = payload["data"][0]
    return str(firstPage["cid"])
def GetDanmuByBid(queryBid):
    """Resolve a video id to its cid, then download and save its danmaku.

    Args:
        queryBid: Video id string as accepted by ``GetCidByBid``.
    """
    GetDanmuByCid(GetCidByBid(queryBid))
def GetBidsBySearch(searchKeyword):
    """Return the video ids linked from the first search-result page.

    Args:
        searchKeyword: Free-text search keyword. It is percent-encoded before
            being placed in the query string, so characters such as ``&``,
            ``#`` or ``+`` no longer corrupt the URL.

    Returns:
        A list of the id strings captured from every
        ``//www.bilibili.com/video/<id>?from=search`` link in the page HTML,
        in document order. Repeated links are not de-duplicated.
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import quote

    urlSearch = ("https://search.bilibili.com/all?keyword="
                 + quote(searchKeyword, safe="")
                 + "&from_source=web_search")
    htmlSearch = GetHTMLContent(urlSearch)
    # Dots are escaped so they only match the literal host name.
    reBid = re.compile(r'//www\.bilibili\.com/video/(.*?)\?from=search')
    return reBid.findall(htmlSearch)
if __name__ == "__main__":
    # Crawl every video found on the first search-result page, one by one.
    for bvid in GetBidsBySearch("記憶碎片"):
        GetDanmuByBid(bvid)
    print("Succeed :)")
多線程並行
我們開了十個線程並行,每個線程負責一頁搜索結果。
import json
import re
import requests
import json
import time
import threading
def GetHTMLContent(url, timeout=10):
    """Fetch ``url`` with a desktop-browser User-Agent and return the body as text.

    Args:
        url: Address to request with HTTP GET.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled
            connection, and the main thread's ``join()`` would then never
            return.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        requests.exceptions.RequestException: On connection failure or timeout.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    # Decode the raw bytes explicitly as UTF-8 instead of relying on the
    # charset requests would pick for ``response.text``.
    return response.content.decode("utf-8")
def SaveDanmuList(list, cid):
    """Parse danmaku XML text and write the entries to ``output/<cid>.json``.

    Every ``<d p="...">text</d>`` element becomes one dict holding the eight
    comma-separated ``p`` fields, the danmaku text and the video ``cid``.
    All values are kept as the strings found in the XML.

    Args:
        list: Raw danmaku XML text. (The name shadows the builtin; it is kept
            so existing callers are unaffected.)
        cid: Video cid as a string; stored in every entry and used as the
            output file name.
    """
    # Function-scope import keeps this block self-contained.
    import os

    reDanmu = re.compile(
        r'<d p="(.*?),(.*?),(.*?),(.*?),(.*?),(.*?),(.*?),(.*?)">(.*?)</d>')
    # Order matches the capture groups of the pattern above.
    fieldNames = ("time", "mode", "size", "color", "timestamp",
                  "pool", "author", "rowid", "text")

    listDictDanmu = []
    for itemDanmu in reDanmu.findall(list):
        dictItemDanmu = dict(zip(fieldNames, itemDanmu))
        dictItemDanmu["cid"] = cid
        listDictDanmu.append(dictItemDanmu)

    # open() does not create missing directories; exist_ok makes this safe
    # when several worker threads reach this point at the same time.
    os.makedirs("output", exist_ok=True)
    # Serialize once, and let the context manager close the file even if
    # writing fails.
    with open("output/"+cid+".json", "w", encoding="utf-8") as fileOutput:
        fileOutput.write(json.dumps(listDictDanmu, ensure_ascii=False,
                                    sort_keys=True, indent=4,
                                    separators=(',', ':')))
def GetDanmuByCid(queryCid):
    """Download the danmaku XML for ``queryCid`` and save it as JSON.

    Args:
        queryCid: Video cid as a string; it is concatenated into the URL
            and passed on to ``SaveDanmuList``.
    """
    xmlText = GetHTMLContent('https://comment.bilibili.com/' + queryCid + '.xml')
    SaveDanmuList(xmlText, queryCid)
def GetCidByBid(queryBid):
    """Return the cid of the first ``data`` entry for a video id, as a string.

    Args:
        queryBid: Video id string sent as the ``bvid`` query parameter.

    Returns:
        ``str`` of the ``cid`` field of the first element in the response's
        ``data`` list. Later elements are ignored (presumably the other
        parts of a multi-part video - verify against the API).
    """
    pagelistUrl = ("https://api.bilibili.com/x/player/pagelist?bvid="
                   + queryBid + "&jsonp=jsonp")
    payload = json.loads(GetHTMLContent(pagelistUrl))
    firstPage = payload["data"][0]
    return str(firstPage["cid"])
def GetDanmuByBid(queryBid):
    """Resolve a video id to its cid, then download and save its danmaku.

    Args:
        queryBid: Video id string as accepted by ``GetCidByBid``.
    """
    GetDanmuByCid(GetCidByBid(queryBid))
def GetBidsBySearch(searchKeyword, page=1):
    """Return the video ids linked from one search-result page.

    Args:
        searchKeyword: Free-text search keyword. It is percent-encoded before
            being placed in the query string, so characters such as ``&``,
            ``#`` or ``+`` no longer corrupt the URL.
        page: Result page number sent as the ``page`` query parameter.

    Returns:
        A list of the id strings captured from every
        ``//www.bilibili.com/video/<id>?from=search`` link in the page HTML,
        in document order. Repeated links are not de-duplicated.
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import quote

    urlSearch = ("https://search.bilibili.com/all?keyword="
                 + quote(searchKeyword, safe="")
                 + "&from_source=web_search&page=" + str(page))
    htmlSearch = GetHTMLContent(urlSearch)
    # Dots are escaped so they only match the literal host name.
    reBid = re.compile(r'//www\.bilibili\.com/video/(.*?)\?from=search')
    return reBid.findall(htmlSearch)
def GetDanmuByBids(listBid):
    """Thread worker: download the danmaku of each video id in turn.

    Args:
        listBid: Iterable of video id strings.

    Prints "Thread finish" once every id has been processed.
    """
    for bvid in listBid:
        GetDanmuByBid(bvid)
    print("Thread finish")
if __name__ == "__main__":
    # One worker thread per search-result page, ten pages in total.
    # range() excludes its stop value, so pages 1..10 need a stop of
    # pageCount + 1 (the previous range(1, 10) only covered nine pages).
    pageCount = 10

    timeStart = time.time()
    threadHandles = []
    for page in range(1, pageCount + 1):
        # The search page is fetched here in the main thread; only the
        # per-video danmaku downloads run inside the worker threads.
        listBid = GetBidsBySearch("記憶碎片", page)
        threadHandles.append(
            threading.Thread(target=GetDanmuByBids,
                             name="Thread "+str(page), args=(listBid,)))

    # Start every worker before joining any of them so they run concurrently.
    for threadHandle in threadHandles:
        threadHandle.start()
    for threadHandle in threadHandles:
        threadHandle.join()

    timeEnd = time.time()
    print("timeused: ", timeEnd-timeStart)
測試結果
用電影名“記憶碎片”作為關鍵詞進行檢索測試,10 個線程,每個線程平均 18 條視頻。在 10.17 秒內,獲取 178 個視頻的彈幕,整理後 JSON 共 8.36 MB(平均每 1 KB 約 4 條彈幕)。