該文內容已失效,現已改用scrapy+scrapy-splash來爬取該網站的視頻及用戶信息。由於B站的反爬機制會封IP,而網上的免費代理IP絕大部分已失效,
無法實現一個可靠的IP代理池;免費代理網站本身又有各種反爬,解決反爬後獲取到的有效IP占比極低,不想折騰,因此視頻信息暫時無法成功獲取。
github地址 https://github.com/delav/bstation
該爬蟲可以爬取B站所有視頻的信息:標題,發布時間,鏈接,點擊數,彈幕數,
收藏數,硬幣數,分享數,作者,作者性別,(生日)。輸入你要爬取的起始
視頻編號(如https://www.bilibili.com/video/av15010461,輸入“15010461”)
然後輸入需要爬取的數量即可。可修改代碼選擇存入數據庫或者Excel文件。
沒有用到多進程,多線程,爬取速度有點慢。
注意:起始視頻編號對應的視頻必須存在;如果輸入的起始視頻編號不存在,
會出現錯誤,此問題暫時沒解決。
數據存入數據庫,本地必須先安裝MySQL。
代碼如下(由於B站源代碼經常改,只要查看源代碼,修改一些信息的xpath獲取方式即可):
# coding: utf-8
# windows終端運行修改以下:
# i = input("起始視頻編號:".decode('utf-8').encode('gbk'))
# print u"爬取完成"
import requests
import urllib2
import zlib
from lxml import etree
import MySQLdb
import datetime
import os
import xlwt
import multiprocessing
from xlrd import open_workbook
from xlutils.copy import copy
# import random
import warnings
import sys
# Python 2 only: switch the interpreter's default string encoding to UTF-8.
# sys is reloaded first so that setdefaultencoding can be called.
reload(sys)
sys.setdefaultencoding('utf-8')
warnings.filterwarnings("ignore") # suppress all warning output
# Stat API endpoint; "{}" is filled with the numeric video id.
mode_url = 'https://api.bilibili.com/x/web-interface/archive/stat?aid={}'
# Video page URL; "{}" is filled with the numeric video id.
title_url = 'https://www.bilibili.com/video/av{}'
# The Excel workbook used by excel_save lives in the current working directory.
path = os.getcwd()
file_path = path + os.sep + 'bilibili.xls'
# Browser-like User-Agent sent with every HTTP request made by get_info.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
'Chrome/49.0.2623.112 Safari/537.36'}
# 獲取所需要的信息列表
def get_info(t_url, m_url):
    """Collect the metadata of one video as a flat list.

    The video page is scraped for the title, category, publish time and the
    link to the uploader's data; the stat endpoint and the uploader link are
    then fetched as JSON.

    Args:
        t_url: URL of the video page.
        m_url: URL of the stat API endpoint for the same video.

    Returns:
        A list. On full success it holds 11 items in this order: title,
        type, publish time, address, views, danmaku, favorites, coins,
        shares, author, sex. If any step fails the error is printed (not
        raised) and the list is returned with whatever was collected so
        far, so callers must check its length.
    """
    msg_list = []
    try:
        request = urllib2.Request(t_url, headers=headers)
        request.add_header('Accept-encoding', 'gzip')
        response = urllib2.build_opener().open(request, timeout=10)
        try:
            html = response.read()
            gzipped = response.headers.get('Content-Encoding')
        finally:
            # Release the socket even when reading fails.
            response.close()
        if gzipped:
            # 16 + MAX_WBITS makes zlib expect a gzip header and trailer.
            html = zlib.decompress(html, 16 + zlib.MAX_WBITS)
        html = etree.HTML(html)

        raw_mid = html.xpath("//div[@class='u-face fl']/a/@href")
        author_url = 'https:' + raw_mid[0]

        # Only the part of <title> before the first "_" is kept as the title.
        raw_title = html.xpath("//title[@data-vue-meta='true']/text()")
        msg_list.append(raw_title[0].split('_')[0])  # title

        types = html.xpath("//div[@class='tminfo']/span[last()-1]/a/text()")
        msg_list.append(types[0])  # type

        public_time = html.xpath("//time//i/text()")
        msg_list.append(public_time[0])  # publish time

        # NOTE(review): verify=False disables TLS certificate checks; kept
        # as in the original code.
        response1 = requests.get(m_url, headers=headers, verify=False, timeout=10)
        # Same timeout as the stat request so a stalled server cannot hang
        # the crawl indefinitely.
        response2 = requests.get(author_url, headers=headers, verify=False, timeout=10)
        print("3333 %s" % response1.status_code)
        print("4444 %s" % response2.status_code)
        if response1.status_code == 200 and response2.status_code == 200:
            j1 = response1.json()['data']
            aid = 'www.bilibili.com/video/av' + str(j1['aid'])  # address
            # Per the original author's note: a video without plays reports
            # '--' instead of an integer here, which makes the record fail.
            view = j1['view']
            danmaku = j1['danmaku']
            favorite = j1['favorite']
            coin = j1['coin']
            share = j1['share']
            j2 = response2.json()['data']['card']
            author = j2['name'].encode('utf-8')
            sex = j2['sex'].encode('utf-8')
            msg_list.extend([aid, view, danmaku, favorite, coin, share, author, sex])
    except Exception as e:
        # Best effort: report the problem and return the partial list.
        print(e)
    return msg_list
# 計時裝飾器
def timer(func):
    """Decorator that prints how long a call to ``func`` took.

    After the wrapped function returns, a completion line and the elapsed
    time (days, hours, minutes, seconds) are printed.

    Args:
        func: The callable to time.

    Returns:
        A wrapper that forwards all positional and keyword arguments to
        ``func`` and returns its result. If ``func`` raises, the exception
        propagates and nothing is printed.
    """
    import functools  # local import keeps this block self-contained

    @functools.wraps(func)
    def time_count(*args, **kwargs):
        start_time = datetime.datetime.now()
        result = func(*args, **kwargs)
        elapsed = datetime.datetime.now() - start_time
        # timedelta.seconds is only the sub-day remainder; whole days are
        # reported separately through timedelta.days.
        hour, remainder = divmod(elapsed.seconds, 3600)
        minute, second = divmod(remainder, 60)
        print("爬取完成")
        print("一共用時%s天%s時%s分%s秒" % (elapsed.days, hour, minute, second))
        return result

    return time_count
# 把數據存到MySQL數據庫中
def mysql_save(my_list):
    """Insert one video record into the ``info`` table of the ``bili`` database.

    The database and the table are created on first use.

    Args:
        my_list: Sequence of 11 values matching the table columns in order:
            title, types, pub_time, aid, views, danmaku, favorite, coin,
            share, author, sex.

    Raises:
        Any error raised by MySQLdb while connecting or executing. The
        cursor and the connection are closed in every case.
    """
    # NOTE(review): the credentials are hard-coded here; consider moving
    # them to configuration.
    conn = MySQLdb.connect(host="localhost",
                           port=3306,
                           user="root",
                           passwd="729814",
                           charset="utf8")
    try:
        cur = conn.cursor()
        try:
            cur.execute("create database if not exists bili")
            conn.select_db('bili')
            cur.execute("create table if not exists info (title varchar(30),"
                        "types varchar(10),"
                        "pub_time varchar(20),"
                        "aid varchar(50),"
                        "views int(20),"
                        "danmaku int(15),"
                        "favorite int(15),"
                        "coin int(10),"
                        "share int(10),"
                        "author varchar(30),"
                        "sex varchar(10))")
            sql = "insert into info values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
            # Parameterized insert: the driver handles quoting of the values.
            cur.execute(sql, my_list)
            conn.commit()
        finally:
            cur.close()
    finally:
        # This function runs once per record, so a connection left open
        # here would accumulate over the whole crawl.
        conn.close()
# 把數據存到Excel表格中
def excel_save(my_list):
    """Append one video record as a new row of the ``bilibili.xls`` workbook.

    If the workbook at ``file_path`` does not exist yet, it is created with a
    red, bold header row. The record is then written below the last used row
    and the workbook is saved.

    Args:
        my_list: Sequence of cell values for the new row, in header order.

    Any error is printed together with a hint to close the workbook; nothing
    is raised.
    """
    try:
        first_row = ['標題', '類型', '發布時間', '地址', '播放量', '彈幕', '收藏', '硬幣', '分享', '作者', '性別']
        header_style = xlwt.easyxf('font: color-index red, bold on')
        if not os.path.isfile(file_path):
            w = xlwt.Workbook(encoding='utf-8')
            ws = w.add_sheet('Sheet 1')
            for x, value in enumerate(first_row):
                ws.write(0, x, value.decode('utf-8'), header_style)
            w.save(file_path)
        # xlrd workbooks are read-only, so the existing file is copied into
        # a writable xlwt workbook before the new row is added.
        rexcel = open_workbook(file_path, formatting_info=True)
        rows = rexcel.sheets()[0].nrows  # index of the first free row
        excel = copy(rexcel)
        table = excel.get_sheet(0)
        for y, value in enumerate(my_list):
            # Byte strings are decoded to unicode before being written.
            if isinstance(value, str):
                value = value.decode('utf-8')
            table.write(rows, y, value)
        excel.save(file_path)
    except Exception as e:
        print(e)
        print("請先關閉bilibili.xls")
# 主函數
@timer
def main(i, n):
    """Crawl ``n`` consecutive video ids starting at ``i`` and store each one.

    For every id the video page URL and the stat API URL are built, the
    metadata is fetched with ``get_info`` and, when complete, stored with
    ``mysql_save``. Progress is printed per video.

    Args:
        i: Numeric id of the first video to crawl.
        n: Number of consecutive ids to try.
    """
    # get_info returns exactly this many items when every field was fetched.
    expected_fields = 11
    print("開始爬取....")
    t = 0
    while t < n:
        t += 1
        msg_list = get_info(title_url.format(i), mode_url.format(i))
        if len(msg_list) == expected_fields:
            try:
                # Store in the database. To write to Excel instead, call
                # excel_save(msg_list) here.
                mysql_save(msg_list)
            except Exception as e:
                # One record that cannot be stored must not abort the whole
                # crawl; report it and move on, as get_info does.
                print(e)
                print("爬取%s失敗" % t)
            else:
                print("爬取第%s個成功" % t)
        else:
            print("爬取%s失敗" % t)
        i += 1
if __name__ == '__main__':
    # Python 2's input() evaluates the typed text as an expression, so the
    # text is read with raw_input() and converted to int explicitly.
    num1 = int(raw_input("起始視頻編號:"))
    print("---------------------")
    num2 = int(raw_input("需要爬取數量:"))
    print("---------------------")
    main(num1, num2)