Python爬取B站視頻信息

本文轉載自查看原文 2017-11-14 21:57 5213 Python

該文內容已失效。現已改用 scrapy + scrapy-splash 來爬取該網站的視頻及用戶信息；但由於B站的反爬機制會封IP，而網上的免費代理IP絕大部分已失效，
無法建立一個可靠的IP代理池；免費代理網站本身又有各種反爬，解決反爬後獲取到的有效IP占比極低，不想再折騰，因此視頻信息暫時無法成功獲取。
github地址 https://github.com/delav/bstation

該爬蟲可以爬取B站所有視頻的信息：標題、發布時間、鏈接、點擊數、彈幕數、收藏數、硬幣數、分享數、作者、作者性別（生日）。

輸入你要爬取的起始視頻編號（如 https://www.bilibili.com/video/av15010461 ，輸入“15010461”），然後輸入需要爬取的數量即可。可修改代碼選擇存入數據庫或者Excel文件。

沒有用到多進程，多線程，爬取速度有點慢。

注意：起始視頻編號對應的視頻必須存在；如果輸入的起始視頻編號不存在，會出現錯誤，此問題暫時沒有解決。

　　數據存入數據庫，本地必須先安裝MySQL。

代碼如下（由於B站源代碼經常改，只要查看源代碼，修改一些信息的xpath獲取方式即可）：

# coding: utf-8

# When running from a Windows terminal, change the prompt/print lines as follows:
# i = input("起始視頻編號：".decode('utf-8').encode('gbk'))
# print u"爬取完成"

import requests
import urllib2
import zlib
from lxml import etree
import MySQLdb
import datetime
import os
import xlwt
import multiprocessing
from xlrd import open_workbook
from xlutils.copy import copy
# import random
import warnings
import sys

# Python 2 only: reload sys so that setdefaultencoding is reachable again,
# then make UTF-8 the default codec for implicit str/unicode conversions.
reload(sys)
sys.setdefaultencoding('utf-8')
warnings.filterwarnings("ignore")  # suppress all warning output
# JSON endpoint queried for a video's statistics; {} receives the video number.
mode_url = 'https://api.bilibili.com/x/web-interface/archive/stat?aid={}'
# Video page URL; {} receives the video number.
title_url = 'https://www.bilibili.com/video/av{}'
# The Excel output file is placed in the current working directory.
path = os.getcwd()
file_path = path + os.sep + 'bilibili.xls'
# Browser-like User-Agent header sent with every HTTP request in this script.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                         'Chrome/49.0.2623.112 Safari/537.36'}


# Collect the list of fields needed for one video
def get_info(t_url, m_url):
    """Scrape one video's page data and statistics.

    The video page at ``t_url`` supplies the title, category, publish time
    and the uploader link; ``m_url`` and the uploader link are then fetched
    as JSON for the counters and the author's name and sex.

    Args:
        t_url: URL of the video page.
        m_url: URL of the JSON statistics endpoint for the same video.

    Returns:
        A list. On full success it holds 11 items in this order: title,
        category, publish time, address, views, danmaku, favorites, coins,
        shares, author, sex. If any step fails the error is printed and
        whatever was collected so far (possibly an empty list) is returned,
        so callers must check the length.
    """
    msg_list = []
    try:
        request = urllib2.Request(t_url, headers=headers)
        request.add_header('Accept-encoding', 'gzip')
        response = urllib2.build_opener().open(request, timeout=10)
        try:
            html = response.read()
            gzipped = response.headers.get('Content-Encoding')
        finally:
            # Release the connection even when reading fails.
            response.close()
        if gzipped:
            # 16 + MAX_WBITS tells zlib to expect a gzip header.
            html = zlib.decompress(html, 16 + zlib.MAX_WBITS)
        html = etree.HTML(html)
        raw_mid = html.xpath("//div[@class='u-face fl']/a/@href")
        author_url = 'https:' + raw_mid[0]
        raw_title = html.xpath("//title[@data-vue-meta='true']/text()")
        # Keep only the part of the page title before the first '_'.
        msg_list.append(raw_title[0].split('_')[0])  # title
        types = html.xpath("//div[@class='tminfo']/span[last()-1]/a/text()")
        msg_list.append(types[0])  # category
        public_time = html.xpath("//time//i/text()")
        msg_list.append(public_time[0])  # publish time

        # NOTE(review): verify=False disables TLS certificate verification.
        # Both requests carry a timeout so a stalled server cannot hang the crawl.
        response1 = requests.get(m_url, headers=headers, verify=False, timeout=10)
        response2 = requests.get(author_url, headers=headers, verify=False, timeout=10)
        if response1.status_code == 200 and response2.status_code == 200:
            j1 = response1.json()['data']
            aid = 'www.bilibili.com/video/av' + str(j1['aid'])  # address
            # Play count. Original author's note: a video without a play
            # count shows '--', which is not an integer, so the scrape fails.
            view = j1['view']
            danmaku = j1['danmaku']  # danmaku (bullet comments)
            favorite = j1['favorite']  # favorites
            coin = j1['coin']  # coins
            share = j1['share']  # shares
            j2 = response2.json()['data']['card']
            author = j2['name'].encode('utf-8')  # author
            sex = j2['sex'].encode('utf-8')  # sex
            msg_list.extend([aid, view, danmaku, favorite, coin, share, author, sex])
    except Exception as e:
        # Best-effort scrape: report the problem and return the partial list.
        print(e)
    return msg_list


# Timing decorator
def timer(func):
    """Wrap ``func`` so that its elapsed wall-clock time is printed.

    After ``func`` returns, a completion line and the elapsed time split
    into days, hours, minutes and seconds are printed.

    Args:
        func: The callable to time.

    Returns:
        A wrapper that forwards all positional and keyword arguments to
        ``func`` and returns ``func``'s own return value. An exception
        raised by ``func`` propagates and nothing is printed.
    """
    from functools import wraps

    @wraps(func)
    def time_count(*args, **kwargs):
        start_time = datetime.datetime.now()
        result = func(*args, **kwargs)
        elapsed = datetime.datetime.now() - start_time
        # timedelta.seconds holds only the part below one day; whole days
        # are reported separately through timedelta.days.
        hour, remainder = divmod(elapsed.seconds, 3600)
        minute, second = divmod(remainder, 60)
        print("爬取完成")
        print("一共用時%s天%s時%s分%s秒" % (elapsed.days, hour, minute, second))
        return result
    return time_count


# Store one record in the MySQL database
def mysql_save(my_list):
    """Insert one scraped record into the ``info`` table of database ``bili``.

    The database and the table are created on first use. The cursor and
    the connection are always closed before returning, including when a
    statement fails.

    Args:
        my_list: Sequence of 11 values matching the table columns in order:
            title, types, pub_time, aid, views, danmaku, favorite, coin,
            share, author, sex.

    Raises:
        Whatever MySQLdb raises on connection or statement failure.
    """
    # NOTE(review): the credentials are hard-coded; consider loading them
    # from configuration or the environment instead.
    conn = MySQLdb.connect(host="localhost",
                           port=3306,
                           user="root",
                           passwd="729814",
                           charset="utf8")
    try:
        cur = conn.cursor()
        try:
            cur.execute("create database if not exists bili")
            conn.select_db('bili')
            cur.execute("create table if not exists info (title varchar(30),"
                        "types varchar(10),"
                        "pub_time varchar(20),"
                        "aid varchar(50),"
                        "views int(20),"
                        "danmaku int(15),"
                        "favorite int(15),"
                        "coin int(10),"
                        "share int(10),"
                        "author varchar(30),"
                        "sex varchar(10))")
            # Parameterized insert: the driver escapes the values.
            sql = "insert into info values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
            cur.execute(sql, my_list)
            conn.commit()
        finally:
            cur.close()
    finally:
        # This function runs once per record, so an unclosed connection
        # would leak one server connection per saved video.
        conn.close()


# Append one record to the Excel workbook
def excel_save(my_list):
    """Write ``my_list`` as a new row at the end of the workbook at ``file_path``.

    When the file does not exist yet it is first created with a bold red
    header row. Any exception is caught and printed, followed by a hint to
    close the workbook.

    Args:
        my_list: Values for one row, in column order; plain ``str`` values
            are decoded from UTF-8 before being written.
    """
    try:
        headings = ['標題', '類型', '發布時間', '地址', '播放量', '彈幕', '收藏', '硬幣', '分享', '作者', '性別']
        heading_style = xlwt.easyxf('font: color-index red, bold on')
        if not os.path.isfile(file_path):
            # First run: create the workbook containing only the header row.
            new_book = xlwt.Workbook(encoding='utf-8')
            new_sheet = new_book.add_sheet('Sheet 1')
            column = 0
            for heading in headings:
                new_sheet.write(0, column, heading.decode('utf-8'), heading_style)
                column += 1
            new_book.save(file_path)
        # Re-open the saved file and copy it into a writable workbook.
        existing = open_workbook(file_path, formatting_info=True)
        next_row = existing.sheets()[0].nrows
        writable = copy(existing)
        sheet = writable.get_sheet(0)
        for column, cell in enumerate(my_list):
            content = cell.decode('utf-8') if type(cell) is str else cell
            sheet.write(next_row, column, content)
        writable.save(file_path)
    except Exception as err:
        print(err)
        print("請先關閉bilibili.xls")


# Main routine
@timer
def main(i, n):
    """Crawl ``n`` consecutive video numbers starting at ``i``.

    Each video is scraped with ``get_info``; a complete record (11 fields)
    is stored with ``mysql_save``, anything shorter is reported as a
    failure. Progress is printed for every attempt.

    Args:
        i: First video number to crawl.
        n: Number of videos to attempt.
    """
    print("開始爬取....")
    attempt = 0
    video_id = i
    while attempt < n:
        attempt += 1
        record = get_info(title_url.format(video_id), mode_url.format(video_id))
        if len(record) != 11:
            print("爬取%s失敗失敗" % attempt)
        else:
            # Store in the database
            mysql_save(record)
            # Store in Excel instead:
            # excel_save(record)
            print("爬取第%s個成功" % attempt)
        video_id += 1


if __name__ == '__main__':
    # Python 2's input() evaluates whatever is typed as an expression, so
    # read plain text with raw_input() and convert it to int explicitly.
    # A non-numeric entry raises ValueError.
    num1 = int(raw_input("起始視頻編號："))
    print("---------------------")
    num2 = int(raw_input("需要爬取數量："))
    print("---------------------")
    main(num1, num2)

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯繫本站郵箱yoyou2525@163.com刪除。

猜您在找 爬取b站互動視頻信息 python 爬取bilibili 視頻信息 2021.11.8 python 爬蟲教學B站爬取up全部視頻彈幕，視頻評論，視頻信息代碼講解（最全、清晰易懂） python 爬取B站視頻彈幕信息 Python如何實現爬取B站視頻 Python爬取b站視頻 Python 自動爬取B站視頻 Python 爬取B站（Bilibili.com）UP主的所有公開視頻鏈接及信息 python爬蟲（BeautifulSoup）爬取B站視頻字幕 Python爬蟲一爬取B站小視頻源碼