Scraping Danmaku and Comment Data from Certain Video Sites - Python


This article is for learning and exchange only and has no commercial value; if there is any problem, please contact me and I will deal with it promptly. ---Python逐夢者.

First up: Mango TV.

Danmaku. Take the movie《懸崖之上》(Cliff Walkers) as the example. The danmaku data is loaded dynamically: open the developer tools, let the player load for a while, then search for one danmaku string to find which packet it lives in; after that it is a matter of watching how the request parameters change. Mango TV fetches a new JSON packet for each minute of playback, and each packet contains the danmaku we need. Let's get to work.

import csv
import random
import time
import requests

f = open('懸崖之上.csv', mode='a', encoding='utf-8-sig', newline='')
csvWriter = csv.DictWriter(f, fieldnames=[
    '用戶id',
    '彈幕內容',
    '獲贊數',
])
csvWriter.writeheader()
# Request headers
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36",
}
# Request the packets page by page (one packet per minute of playback)
for page in range(0, 120 + 1):
    print(f'===== crawling page {page} =====')
    time.sleep(random.randint(2, 5))  # random pause between requests
    url = f'https://bullet-ali.xxx.com/bullet/2021/11/15/005204/12281642/{page}.json'
    response = requests.get(url=url, headers=headers)
    # Extract only what we need: the id, the content and the like count
    for i in response.json()['data']['items']:
        # sender id first; not every item carries one
        try:
            id = i['id']
        except KeyError:
            id = 'unknown'
        # then the danmaku text
        content = i['content']
        # like count; may also be missing
        try:
            like = i['v2_up_count']
        except KeyError:
            like = 'n/a'
        dit = {
            '用戶id': id,
            '彈幕內容': content,
            '獲贊數': like,
        }
        print(dit)  # check the data looks right
        csvWriter.writerow(dit)  # write one row per danmaku
    break  # debugging: crawl only one page; remove this line to crawl them all
f.close()

Program output:

Comments. Still with Cliff Walkers as the example, let's grab a round of comment data. The raw comment request URL looks like this: https://comment.xxx.com/v4/comment/getCommentList?page=1&subjectType=hunantv2014&subjectId=12281642&callback=jQuery182024636113438271012_1636961381836&_support=10000000&_=1636961383307. People say the callback parameter interferes with parsing the data, while the timestamp does not break data integrity, so the URL can be trimmed down to: https://comment.xxx.com/v4/comment/getCommentList?page=1&subjectType=hunantv2014&subjectId=12281642&_support=10000000.
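If the callback parameter is left in, the body comes back wrapped as jQuery...( ... ) and response.json() fails. A minimal sketch of unwrapping such a body by hand, on a made-up sample string:

import json
import re

# hypothetical JSONP body, as returned when the callback parameter is kept
raw = 'jQuery182024636113438271012_1636961381836({"data": {"count": 2533}})'
payload = re.search(r'^\w+\((.*)\)\s*;?\s*$', raw, re.S).group(1)  # strip the wrapper
data = json.loads(payload)  # the inside is plain JSON
print(data['data']['count'])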

As for the pagination parameter: there are 2533 comments at 15 per page, and 2533 / 15 = 168.86, so the maximum page number is 169; the short check below confirms it. Let's get to it.
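A minimal sanity check of that page count (2533 being the total the comment API reports):

import math

total_comments = 2533  # total count reported by the comment API
per_page = 15          # comments returned per page
print(math.ceil(total_comments / per_page))  # 169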

 1 """
 2     爬取芒果TV的評論數據
 3 """
 4 import csv
 5 import pprint
 6 import random
 7 import time
 8 
 9 import requests
10 
11 f = open('懸崖之上評論數據.csv', mode='a', encoding='utf-8-sig', newline='')
12 csvWriter = csv.DictWriter(f, fieldnames=[
13     '評論者',
14     '評論創建時間',
15     '評論內容',
16     '被點贊數',
17 ])
18 csvWriter.writeheader() # 寫入頭
19 headers = {
20     'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36',
21 }
22 for page in range(1, 169):
23     print(f'=====正在爬取第{page}頁內容=====')
24     time.sleep(random.randint(2, 5)) # 隨機休眠
25     url = f'https://comment.xxxx.com/v4/comment/getCommentList?page={page}&subjectType=hunantv2014&subjectId=12281642&_support=10000000'
26     response = requests.get(url=url, headers=headers)
27     # print(response.json()['data']['list']) # 這是個列表
28     # pprint.pprint(response.json()['data']['list'])
29     # 提取評論人姓名,評論日期,評論內容和被點贊數
30     for item in response.json()['data']['list']:
31         name = item['user']['nickName'] # 評論人姓名
32         contentCreated = item['date'] # 評論時間,也可以獲取時間戳轉換成本地時間
33         content = item['content'] # 評論內容
34         praiseNum = item['praiseNum'] # 被點贊數
35         dit = {
36             '評論者':name,
37             '評論創建時間':contentCreated,
38             '評論內容':content,
39             '被點贊數':praiseNum,
40         }
41         print(dit)
42         # 寫入到csv
43         csvWriter.writerow(dit) # 逐行寫入
44 print('爬取完成!')

Screenshot of the run:

 

Next up: Tencent Video.

Danmaku. Open the developer tools: for every 30 seconds of playback, the player fetches another JSON packet containing the danmaku data we need.

Comparing the request URLs shows that the timestamp parameter is 15 on the first request and then grows in steps of 30:

https://mfm.video.xx.com/danmu?otype=json&callback=jQuery19109701649659612566_1637029736329&target_id=7220956568&vid=t0040z3o3la&session_key=0,38,1637029735&timestamp=15&_=1637029736342
https://mfm.video.xx.com/danmu?otype=json&callback=jQuery19109701649659612566_1637029736329&target_id=7220956568&vid=t0040z3o3la&session_key=0,38,1637029735&timestamp=45&_=1637029736342

As before, kill the unnecessary callback parameter. Let's go:

import csv
import random
import time
import requests

f = open('某訊視頻彈幕數據.csv', mode='a', encoding='utf-8-sig', newline='')
csvWriter = csv.DictWriter(f, fieldnames=[
    '彈幕發送者ID',
    '彈幕內容',
    '獲贊數',
])
# write the header row
csvWriter.writeheader()
# Request headers
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36',
    'referer': 'https://v.qq.com/',
}
# Crawl page by page: timestamp starts at 15 and steps by 30; 7245 is the video length in seconds
for timestamp in range(15, 7245, 30):
    time.sleep(random.randint(2, 5))  # random pause
    url = f'https://mfm.video.xxx.com/danmu?otype=json&target_id=7220956568&vid=t0040z3o3la&session_key=0,38,1637029735&timestamp={timestamp}&_=1637029736342'
    # request the data
    response = requests.get(url=url, headers=headers)
    # with the callback parameter removed from the url, response.json() parses directly
    for item in response.json()['comments']:
        id = item['commentid']  # danmaku sender id
        danmu = item['content']  # danmaku text
        like = item['upcount']  # like count
        dit = {
            '彈幕發送者ID': id,
            '彈幕內容': danmu,
            '獲贊數': like,
        }
        print(dit)
        csvWriter.writerow(dit)

f.close()
print('Done crawling!')

Screenshot of the run:

Comments. Tencent Video's comments sit at the bottom of the page and are loaded dynamically, so the requests have to be captured and analyzed.

The request URLs are:

https://video.coral.xxx.com/varticle/6655100451/comment/v2?callback=_varticle6655100451commentv2&orinum=10&oriorder=o&pageflag=1&cursor=0&scorecursor=0&orirepnum=2&reporder=o&reppageflag=1&source=132&_=1637030980407
https://video.coral.xxx.com/varticle/6655100451/comment/v2?callback=_varticle6655100451commentv2&orinum=10&oriorder=o&pageflag=1&cursor=6829967729286321250&scorecursor=0&orirepnum=2&reporder=o&reppageflag=1&source=132&_=1637030980410

The change is in the cursor parameter. It starts at 0; after paging, the cursor is the data.last field from the previous URL's JSON, i.e. the next cursor can be obtained via response.json()['data']['last']. Let's go:
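The cursor flow can also be expressed as a small generator; a sketch, assuming the response shape described above (data.last for the cursor, data.oriCommList for the page of comments):

import requests

def iter_comment_pages(base_url, headers, max_pages):
    """Yield one page of comments at a time, following the data.last cursor."""
    cursor = 0  # the first request always starts from cursor=0
    for _ in range(max_pages):
        # base_url must contain a {cursor} placeholder
        res = requests.get(base_url.format(cursor=cursor), headers=headers).json()
        yield res['data']['oriCommList']
        cursor = res['data']['last']  # becomes the cursor of the next request

Written this way, the page == 1 special case in the code below is unnecessary, since 0 is simply the first cursor value.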

import csv
import random
import re
import time
import requests
from urllib.parse import unquote

f = open('某訊視頻評論.csv', mode='a', encoding='utf-8-sig', newline='')
csvWriter = csv.DictWriter(f, fieldnames=[
    'Id',
    '評論人',
    '評論時間',
    '獲贊數',
    '評論內容',
])
# write the header row
csvWriter.writeheader()
# Request headers
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36',
}

# 10 comments per page and 3203 comments in total, so at most 321 pages
page = 1  # start counting from 1: the first page seeds the cursor used by the else branch
while page <= 321:
    print(f'===== crawling page {page} =====')
    time.sleep(random.randint(2, 5))  # random pause
    # the first page matters: it supplies the cursor for every request after it
    if page == 1:
        # again with the callback parameter stripped from the url
        url = 'https://video.coral.xxx.com/varticle/6655100451/comment/v2?orinum=10&oriorder=o&pageflag=1&cursor=0&scorecursor=0&orirepnum=2&reporder=o&reppageflag=1&source=132'
    else:
        url = f'https://video.coral.xxxx.com/varticle/6655100451/comment/v2?orinum=10&oriorder=o&pageflag=1&cursor={cursor}&scorecursor=0&orirepnum=2&reporder=o&reppageflag=1&source=132'
    res = requests.get(url=url, headers=headers).json()  # request the page
    cursor = res['data']['last']  # every response carries the cursor for the next page
    print(f'===== requested url: {url} =====')
    time.sleep(2)
    # extract the fields from this page
    for item in res['data']['oriCommList']:
        id = item['id']  # comment id
        # the custom field is url-encoded; decode it with urllib's unquote
        nickname = unquote(item['custom'])
        commentTime = item['time']  # a unix timestamp; converted when stored below
        like = item.get('up', 'n/a')  # like count, which may be missing
        content = item['content']  # comment text
        dit = {
            'Id': id,
            '評論人': re.findall('nick=(.*?)&head', nickname)[0],  # pull the name out of the decoded string
            '評論時間': time.strftime("%Y-%m-%d %H:%M", time.localtime(int(commentTime))),  # timestamp -> local time
            '獲贊數': like,
            '評論內容': content,
        }
        print(dit)
        csvWriter.writerow(dit)  # write one row per comment
    page += 1  # move to the next page
    time.sleep(random.uniform(2, 3))
f.close()
print('Done crawling comments!')
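A sturdier alternative to the nick=(.*?)&head regex: after unquote, the custom field looks like a query string (nick=...&head=...), so urllib.parse.parse_qs can split it, assuming the field keeps that shape. A sketch with a made-up sample value:

from urllib.parse import parse_qs, unquote

custom = 'nick%3D%E5%BC%B5%E4%B8%89%26head%3D%2F%2Fexample.com%2Fa.jpg'  # hypothetical custom field
fields = parse_qs(unquote(custom))
nickname = fields.get('nick', ['unknown'])[0]
print(nickname)  # 張三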

Partial screenshot of the run:

Third up: Bilibili.

The example is《EDG奪冠時刻》, a documentary on Bilibili. Video URL: https://www.bilibili.com/bangumi/play/ss39849/?from=search&seid=6112754044363142537&spm_id_from=333.337.0.0.

Danmaku:

Find the video and open the danmaku list on the right; watching the requests as the danmaku loads gives data like the following:

If you are logged in, you can click "view historical danmaku"; when not logged in, the button is grayed out.

The historical danmaku here spans December 1 to December 8, 2021. Clicking "view historical danmaku" issues one request per day, with URLs like https://api.bilibili.com/x/v2/dm/web/history/seg.so?type=1&oid=445826862&date=2021-12-02. With the pattern found, construct the URLs and go.

import requests
import pandas as pd
import re

def data_response(url):
    headers = {
        "cookie":"_uuid=BE35640F-EB4E-F87D-53F2-7A8FD5D50E3330964infoc; buvid3=D0213B95-F001-4A46-BE4F-E921AE18EB67167647infoc; CURRENT_BLACKGAP=1; CURRENT_QUALITY=0; rpdid=|(u))ku~m)kJ0J'uYJuRRRYmk; video_page_version=v_old_home_17; blackside_state=1; LIVE_BUVID=AUTO1516364619569495; b_lsid=E27592910_17D990B450B; bsource=search_baidu; buvid_fp=D0213B95-F001-4A46-BE4F-E921AE18EB67167647infoc; innersign=1; sid=ipqajpj8; CURRENT_FNVAL=80; PVID=2; fingerprint=23eb07890bf96775d60093211947fae4; buvid_fp_plain=2919B0C8-360F-47D1-8DD1-51FA81536F4E34777infoc; DedeUserID=603136708; DedeUserID__ckMd5=2e5e771f4e696459; SESSDATA=93ba949a,1654503622,fb700*c1; bili_jct=9d8bc6e01fc089192a6aeed373a0333c",
        "referer":"https://www.bilibili.com/bangumi/play/ss39849/?from=search&seid=6112754044363142537&spm_id_from=333.337.0.0",
        "user-agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36",
    }
    response = requests.get(url=url, headers=headers)
    return response

def main(oid, month):
    df = pd.DataFrame()
    # the index endpoint lists every date in the month that has historical danmaku
    url = f'https://api.bilibili.com/x/v2/dm/history/index?month={month}&type=1&oid={oid}'
    list_data = data_response(url).json()['data']  # all available dates
    print(list_data)
    for date in list_data:
        urls = f'https://api.bilibili.com/x/v2/dm/web/history/seg.so?type=1&oid={oid}&date={date}'  # one day's danmaku
        # the segment body is not plain json, so pull out the runs of Chinese characters
        # (crude: danmaku that contain no Chinese characters are dropped)
        text = re.findall('[\u4E00-\u9FA5]+', data_response(urls).text)
        for e in text:
            print(e)
            data = pd.DataFrame({'彈幕': [e]})
            df = pd.concat([df, data])
    df.to_csv('彈幕.csv', encoding='utf-8-sig', index=False, mode='a+')


if __name__ == "__main__":
    oid = '445826862'  # the oid from the danmaku urls
    month = '2021-12'  # only 2021-12-01 through 2021-12-08 exist for this video
    main(oid, month)  # run the crawl

That pulls down all the danmaku, though note the seg.so responses are not plain JSON, so the regex only recovers the Chinese text. There is another way to scrape Bilibili danmaku: reportedly it also lives at https://comment.bilibili.com/445826862.xml, where the number is the video's oid. Opening that in a browser confirms it is all there, so let's scrape it with Python as well.

import requests
import pandas as pd
import re

def data_get(oid):
    headers = {
        "cookie": "_uuid=BE35640F-EB4E-F87D-53F2-7A8FD5D50E3330964infoc; buvid3=D0213B95-F001-4A46-BE4F-E921AE18EB67167647infoc; CURRENT_BLACKGAP=1; CURRENT_QUALITY=0; rpdid=|(u))ku~m)kJ0J'uYJuRRRYmk; video_page_version=v_old_home_17; blackside_state=1; LIVE_BUVID=AUTO1516364619569495; b_lsid=E27592910_17D990B450B; bsource=search_baidu; buvid_fp=D0213B95-F001-4A46-BE4F-E921AE18EB67167647infoc; innersign=1; sid=ipqajpj8; CURRENT_FNVAL=80; PVID=2; fingerprint=23eb07890bf96775d60093211947fae4; buvid_fp_plain=2919B0C8-360F-47D1-8DD1-51FA81536F4E34777infoc; DedeUserID=603136708; DedeUserID__ckMd5=2e5e771f4e696459; SESSDATA=93ba949a,1654503622,fb700*c1; bili_jct=9d8bc6e01fc089192a6aeed373a0333c",
        "referer": "https://www.bilibili.com/bangumi/play/ss39849/?from=search&seid=6112754044363142537&spm_id_from=333.337.0.0",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36",
    }
    url = f'https://comment.bilibili.com/{oid}.xml'
    # request the xml document
    response = requests.get(url=url, headers=headers)
    response.encoding = 'utf-8'  # the xml is utf-8 encoded
    print(response.text)
    # the captured group is the danmaku text; findall returns a list
    textData = re.findall('<d p=".*?">(.*?)</d>', response.text)
    df = pd.DataFrame()
    # save the data
    for item in textData:
        print(item)
        data = pd.DataFrame({'彈幕': [item]})
        df = pd.concat([df, data])
    df.to_csv('彈幕1.csv', encoding='utf-8-sig', index=False, mode='a+')

if __name__ == "__main__":
    oid = '445826862'
    data_get(oid)
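Incidentally, the p attribute that the regex above discards holds the danmaku metadata. It is commonly documented as comma-separated fields: appearance time in seconds, display mode, font size, color, send timestamp, danmaku pool, sender hash, and danmaku id; treat that layout as an assumption rather than a guarantee. A sketch on a made-up element:

import re

# hypothetical <d> element in the comment.bilibili.com xml format
sample = '<d p="116.566,1,25,16777215,1638370000,0,1a2b3c4d,1234567890">EDG加油</d>'
for p, text in re.findall(r'<d p="(.*?)">(.*?)</d>', sample):
    appear_sec, mode, size, color = p.split(',')[:4]
    print(appear_sec, mode, size, color, text)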

Comparing a few rows from the two dumps, they are completely different; the two endpoints presumably store danmaku differently. No point dwelling on it. Next, the comments.

Comments:

Open the developer tools (F12) and find the comment requests. The URLs look like this:

https://api.bilibili.com/x/v2/reply/main?callback=jQuery172024432989634133118_1638953989760&jsonp=jsonp&next=0&type=1&oid=506840377&mode=3&plat=1&_=1638954002015
https://api.bilibili.com/x/v2/reply/main?callback=jQuery172024432989634133118_1638953989760&jsonp=jsonp&next=2&type=1&oid=506840377&mode=3&plat=1&_=1638954002015
https://api.bilibili.com/x/v2/reply/main?callback=jQuery172024432989634133118_1638953989760&jsonp=jsonp&next=3&type=1&oid=506840377&mode=3&plat=1&_=1638954002015

The URLs are a bit quirky: only one parameter changes, next. It is 0 for the first page, then 2 for the second and 3 for the third, not the usual 0, 1, 2, although requesting 0, 1, 2 does also return data. We will follow the site's own format and, as before, strip the unnecessary parameters: the callback parameter, which breaks JSON parsing, and the trailing timestamp, which does not.

import csv
import random
import threading
import time
import requests
import pandas as pd

# Request headers
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36',
}

def pdToCsv():
    df = pd.DataFrame()  # collect rows in a DataFrame and save once at the end
    try:
        a = 1  # page counter
        while True:
            if a == 1:  # the first page uses next=0
                url = 'https://api.bilibili.com/x/v2/reply/main?jsonp=jsonp&next=0&type=1&oid=506840377&mode=3&plat=1'
            else:
                url = f'https://api.bilibili.com/x/v2/reply/main?jsonp=jsonp&next={a}&type=1&oid=506840377&mode=3&plat=1'
            # request the page
            response = requests.get(url=url, headers=headers)
            time.sleep(random.uniform(2, 5))
            for i in response.json()['data']['replies']:
                uname = i['member']['uname']  # user name
                sex = i['member']['sex']  # gender
                mid = i['mid']  # user id
                current_level = i['member']['level_info']['current_level']  # user level
                message = i['content']['message'].replace('\n', '')  # comment text
                like = i['like']  # like count
                ctime = i['ctime']  # comment time as a unix timestamp
                data = pd.DataFrame({'用戶名': [uname], '性別': [sex], 'id': [mid],
                                     'vip等級': [current_level], '評論': [message], '獲贊數': [like],
                                     '評論時間': [ctime]})
                df = pd.concat([df, data])
            a += 1  # next page
    except Exception as e:
        # once the pages run out, replies is empty and the loop ends here
        print(e)
    df.to_csv('我們是冠軍pd.csv', encoding='utf-8-sig')  # save the data
    print(df.shape)


"""The same crawl can also be written like this"""
def stringToCsv():
    f = open('我們是冠軍評論csv.csv', mode='a', encoding='utf-8-sig', newline='')  # open the file
    csvWriter = csv.DictWriter(f, fieldnames=[
        '用戶名',
        '性別',
        'id',
        'vip等級',
        '評論內容',
        '獲贊數',
        '評論時間',
    ])
    csvWriter.writeheader()  # write the header row
    n = 1
    while n < 5426 / 10 + 1:  # 5426 comments at 10 per page, so roughly 543 pages
        time.sleep(random.uniform(2, 5))
        if n == 1:  # the loop's first request uses next=0
            url = 'https://api.bilibili.com/x/v2/reply/main?jsonp=jsonp&next=0&type=1&oid=506840377&mode=3&plat=1'
        else:
            url = f'https://api.bilibili.com/x/v2/reply/main?jsonp=jsonp&next={n}&type=1&oid=506840377&mode=3&plat=1'

        # request the page
        response = requests.get(url=url, headers=headers)
        for i in response.json()['data']['replies']:
            uname = i['member']['uname']  # user name
            sex = i['member']['sex']  # gender
            mid = i['mid']  # user id
            current_level = i['member']['level_info']['current_level']  # user level
            message = i['content']['message'].replace('\n', '')  # comment text
            like = i['like']  # like count
            ctime = i['ctime']  # comment time as a unix timestamp
            print(uname, sex, mid, current_level, message, like, ctime, sep='|')
            dit = {
                '用戶名': uname,
                '性別': sex,
                'id': mid,
                'vip等級': current_level,
                '評論內容': message,
                '獲贊數': like,
                '評論時間': ctime,
            }
            # write one row per comment
            csvWriter.writerow(dit)
        n += 1  # advance the page counter inside the loop, otherwise it never terminates
    f.close()  # close the file

if __name__ == "__main__":
    # run both versions at once, each in its own thread
    thread1 = threading.Thread(target=pdToCsv)
    thread2 = threading.Thread(target=stringToCsv)
    thread1.start()
    thread2.start()
    thread1.join()
    thread2.join()
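One loose end: both functions store ctime as a raw Unix timestamp. It can be converted to a readable time the same way as in the Tencent section, for example:

import time

ctime = 1638954002  # a sample raw ctime value like those stored above
print(time.strftime('%Y-%m-%d %H:%M', time.localtime(ctime)))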

 

