爬取b站排行榜並存到mysql中

目的

b站是我平時看得最多的一個網站。最近接到了一個爬蟲的課設：首先要選擇一個網站並對其進行爬取，最後將該網站的數據存儲起來並使其可視化。

網站的結構

目標網站:bilibili排行榜

bilibili排行榜的地址

網頁的層次

首先要確定要提取的信息，也就是標題、播放量、評論量、作者（up主）和綜合評分

在網頁源代碼中找到要找的信息

每個網頁中大概有多條這樣的rank-item項目，要提取的信息就包含其中

<li class="rank-item">
     <div class="num">3</div>
         <div class="content">
           <div class="img">
           <a href="//bangumi.bilibili.com/anime/28016" target="_blank">
                   <div class="lazy-img cover"><img alt="女高中生的虛度日常" src=""></div>
           </a>
           <!---->
     </div>
       <div class="info">
           <a href="//bangumi.bilibili.com/anime/28016" target="_blank" class="title">
           女高中生的虛度日常
           </a>
       <div class="bangumi-info">全12話</div>
       <div class="detail">
       <span class="data-box">
           <i class="b-icon play"></i>
           3491.1萬
       </span>
       <span class="data-box">
           <i class="b-icon view"></i>
           74.3萬
       </span>
       <span class="data-box">
           <i class="fav">
           </i>
           176.4萬
       </span></div>
       <div class="pts">
             <div>2218000</div>綜合得分
       </div>
   </div>                              
   </div>
</li>

1.名稱在title類的a標籤下
2.播放量、評論數和up主在data-box類的span標籤下
3.綜合評分在pts類的div標籤下

對應解析其的代碼

def getPage(url):
    """Scrape one ranking page and return its rows.

    Fetches ``url`` through ``Spider`` and walks every
    ``<li class="rank-item">`` element. Each row is built, in order, from the
    text of every ``<a class="title">``, the text of every
    ``<span class="data-box">`` and finally the text of the first ``<div>``
    inside ``<div class="pts">``.

    Args:
        url: Address of the ranking page to scrape.

    Returns:
        A list holding one list of whitespace-stripped strings per rank item.
        A score element that is missing is stored as ``''`` so that a single
        malformed item cannot abort the whole page.
    """
    spider = Spider(url)
    spider.setSoup()
    pageContentList = []
    for item in spider.findTagByAttrs('li', 'rank-item'):
        # get_text() copes with tags that have child tags or no text at all,
        # where ``.string`` yields None and a regex over the markup finds nothing.
        pageContentItem = [
            title.get_text(strip=True) for title in item.find_all('a', 'title')
        ]
        pageContentItem.extend(
            box.get_text(strip=True) for box in item.find_all('span', 'data-box')
        )

        # The score lives in the first <div> nested inside <div class="pts">.
        ptsTag = item.find('div', 'pts')
        scoreTag = ptsTag.div if ptsTag is not None else None
        pageContentItem.append(
            scoreTag.get_text(strip=True) if scoreTag is not None else ''
        )
        pageContentList.append(pageContentItem)
    return pageContentList

網站的層次

通過觀察鏈接參數的變化

https://www.bilibili.com/ranking/all/0/0/3

以這個鏈接為例，通過實驗可以看出鏈接中各段的含義：ranking代表排行榜；all代表全站榜（origin則代表原創榜）；第一個參數0代表分區（0為全站）；第二個參數0代表全部投稿（1為近期投稿）；第三個參數3代表三日排行。根據這個規律，得到了下面生成鏈接的代碼。不過只有全站榜和原創榜符合這個規律，其他榜單暫時沒有爬取。

def getURLFormBilibili():
    """Build the URL of every ranking page the crawler visits.

    Every combination of rank type, area, submission filter and time span
    produces one entry.

    Returns:
        A dict mapping a label of the form ``<rank>_<area>_<submit>_<span>``
        to the matching ``https://www.bilibili.com/ranking/...`` URL.
    """
    # URL segment -> human-readable label, one table per path segment.
    rankLabels = {'all': '全站', 'origin': '原創'}
    areaLabels = {
        0: '全站', 1: '動畫', 168: '國漫相關', 3: '音樂', 129: '舞蹈',
        4: '游戲', 36: '科技', 188: '數碼', 160: '生活', 119: '鬼畜',
        155: '時尚', 5: '娛樂', 181: '影視',
    }
    submitLabels = {'0': '全部投稿', '1': '近期投稿'}
    spanLabels = {1: '日排行', 3: '三日排行', 7: '周排行', 30: '月排行'}

    urlTemplate = 'https://www.bilibili.com/ranking/{}/{}/{}/{}'
    urlDict = {}  # label -> URL
    for rankKey, rankLabel in rankLabels.items():
        for areaKey, areaLabel in areaLabels.items():
            for submitKey, submitLabel in submitLabels.items():
                for spanKey, spanLabel in spanLabels.items():
                    label = '_'.join((rankLabel, areaLabel, submitLabel, spanLabel))
                    urlDict[label] = urlTemplate.format(rankKey, areaKey, submitKey, spanKey)
    return urlDict

保存到mysql數據庫

這裡使用了pymysql這個庫，用pip安裝即可，不再贅述。為了方便，我把數據庫操作寫成了一個類

    
class MysqlConnect:
    """Thin helper around ``pymysql`` for the ranking tables.

    Every operation opens its own connection through ``getConnect`` and
    closes it again before returning.
    """

    def __init__(self, host='localhost', user='root', passwd='你的密碼',
                 port=3306, db='bilibilirank', charset='utf8'):
        """Store the connection settings; no connection is opened here.

        The defaults are the values that were previously hard-coded in
        ``getConnect``, so ``MysqlConnect()`` behaves as before.

        Args:
            host: Database server host name.
            user: Login name.
            passwd: Login password.
            port: Server port; must be an int.
            db: Name of the database to use.
            charset: Connection charset; write ``utf8``, not ``utf-8``.
        """
        self.host = host
        self.user = user
        self.passwd = passwd
        self.port = port
        self.db = db
        self.charset = charset

    @staticmethod
    def _quoteIdentifier(name):
        """Return ``name`` wrapped in backticks, doubling embedded backticks."""
        return '`{}`'.format(str(name).replace('`', '``'))

    def getConnect(self):
        """Open and return a new ``pymysql`` connection from the stored settings."""
        return pymysql.connect(
            host=self.host, user=self.user, passwd=self.passwd,
            port=self.port, db=self.db, charset=self.charset
        )

    def insertInfo(self, sql, params=None):
        """Execute one write statement and commit it.

        Args:
            sql: Statement to execute.
            params: Optional values for ``%s`` placeholders in ``sql``. They
                are handed to ``cursor.execute`` so the driver does the quoting.

        On failure the error is printed and the transaction rolled back; the
        method then returns normally so one bad row does not stop the caller.
        The connection is closed in every case.
        """
        db = self.getConnect()
        try:
            cursor = db.cursor()
            try:
                cursor.execute(sql, params)
                db.commit()
                print("sucessed...")
            except Exception:
                print("failed...")
                traceback.print_exc()
                db.rollback()
        finally:
            db.close()

    def queryOutCome(self, sql):
        """Run a query and return its first row.

        Args:
            sql: Query to execute.

        Returns:
            The value of ``cursor.fetchone()``, or ``None`` when the query
            raised (the traceback is printed and the transaction rolled back).
        """
        db = self.getConnect()
        result = None  # stays None when the query fails
        try:
            cursor = db.cursor()
            cursor.execute(sql)
            result = cursor.fetchone()
        except Exception:
            traceback.print_exc()
            db.rollback()
        finally:
            db.close()
        return result

    def getCreateTableSql(self, tableName):
        """Return the ``create table`` statement for a ranking table.

        Args:
            tableName: Name of the table; it is backtick-quoted in the result.
        """
        sql='''
        create table {} (
            id int(11) auto_increment primary key,
            title char(100) NOT NULL UNIQUE,
            playnum char(100) NOT NULL,
            commentnum char(100) NOT NULL,
            author char(100) NOT NULL,
            score char(100) NOT NULL
        )ENGINE=innodb DEFAULT CHARSET=utf8;
        '''.format(self._quoteIdentifier(tableName))
        return sql

    def getInsertToTableSql(self, tableName, title, playnum, commentnum, author, score):
        """Return an ``insert`` statement for one ranking row.

        The five values are converted with ``str`` and escaped before being
        placed inside the quoted literals, so text containing quotes or
        backslashes cannot break out of the statement.

        NOTE(review): passing the values through ``insertInfo(sql, params)``
        with ``%s`` placeholders is the preferred way to insert scraped text;
        this method keeps returning a finished string for existing callers.
        """
        escape = pymysql.converters.escape_string
        values = [
            escape(str(value))
            for value in (title, playnum, commentnum, author, score)
        ]
        sql='''
        insert into {} values(null,'{}','{}','{}','{}','{}');
        '''.format(self._quoteIdentifier(tableName), *values)
        return sql

    def createTable(self, tableName, sql):
        """Drop ``tableName`` if it exists, then run ``sql`` to create it.

        Args:
            tableName: Table to drop; quoted the same way as in
                ``getCreateTableSql``.
            sql: The ``create table`` statement to execute afterwards.
        """
        db = self.getConnect()
        try:
            cursor = db.cursor()
            cursor.execute("drop table if exists {}".format(self._quoteIdentifier(tableName)))
            cursor.execute(sql)
        finally:
            # Close the connection even when one of the statements fails.
            db.close()

爬取數據

按照頁面逐個爬取保存到數據庫

if __name__ == "__main__":
    # Crawl every ranking page and store each one in a table of its own.
    urlDict = getURLFormBilibili()  # ranking label -> page URL
    mysqlconnect = MysqlConnect()   # database helper

    for urlName, url in urlDict.items():
        print("正在處理"+urlName+"頁面...")
        tableName = urlName  # the ranking label doubles as the table name
        createsql = mysqlconnect.getCreateTableSql(tableName)
        mysqlconnect.createTable(tableName, createsql)
        for contentItem in getPage(url):
            # A row needs title, play count, comment count, author and score.
            # Skip an incomplete item instead of letting an IndexError abort
            # the whole crawl.
            if len(contentItem) < 5:
                print("skipped incomplete item: {}".format(contentItem))
                continue
            title, playnum, commentnum, author, score = contentItem[:5]
            insertsql = mysqlconnect.getInsertToTableSql(
                tableName, title, playnum, commentnum, author, score
            )
            print(insertsql)
            mysqlconnect.insertInfo(insertsql)

結果

源代碼

import requests
import re
from bs4 import BeautifulSoup
import pymysql
import traceback

class Spider:
    """Small convenience wrapper around ``requests`` and ``BeautifulSoup``.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self, url):
        """Remember the page address; nothing is downloaded yet.

        Args:
            url: Address of the page to fetch.
        """
        self.url = url
        self.soup = None  # parsed document, filled in by setSoup()

    def getHTML(self):
        """Download the page and return its text.

        Returns:
            The response body decoded with the detected encoding, or the
            placeholder string ``"網頁訪問失敗"`` when the request fails
            (connection error, timeout or an HTTP error status).
        """
        headers={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.103 Safari/537.36'}
        try:
            response = requests.get(url=self.url, headers=headers, timeout=20)
            response.raise_for_status()
            response.encoding = response.apparent_encoding
            return response.text
        except requests.RequestException:
            # Only request failures are turned into the placeholder; anything
            # else (e.g. KeyboardInterrupt) is allowed to propagate.
            return "網頁訪問失敗"

    def setSoup(self):
        """Fetch the page and store the parsed document in ``self.soup``."""
        html = self.getHTML()
        self.soup = BeautifulSoup(html, 'html.parser')

    def _getSoup(self):
        """Return the parsed document, fetching it first if not done yet."""
        if self.soup is None:
            self.setSoup()
        return self.soup

    def findTag(self, tagName):
        """Return every tag named ``tagName`` in the document."""
        return self._getSoup().find_all(tagName)

    def findTagByAttrs(self, tagName, attrs):
        """Return every ``tagName`` tag matching ``attrs``.

        ``attrs`` is passed straight to ``find_all`` as its second argument.
        """
        return self._getSoup().find_all(tagName, attrs)

    def getBeautifyHTML(self):
        """Return the document re-serialised by ``prettify()``."""
        return self._getSoup().prettify()


def getPage(url):
    """Scrape one ranking page and return its rows.

    Fetches ``url`` through ``Spider`` and walks every
    ``<li class="rank-item">`` element. Each row is built, in order, from the
    text of every ``<a class="title">``, the text of every
    ``<span class="data-box">`` and finally the text of the first ``<div>``
    inside ``<div class="pts">``.

    Args:
        url: Address of the ranking page to scrape.

    Returns:
        A list holding one list of whitespace-stripped strings per rank item.
        A score element that is missing is stored as ``''`` so that a single
        malformed item cannot abort the whole page.
    """
    spider = Spider(url)
    spider.setSoup()
    pageContentList = []
    for item in spider.findTagByAttrs('li', 'rank-item'):
        # get_text() copes with tags that have child tags or no text at all,
        # where ``.string`` yields None and a regex over the markup finds nothing.
        pageContentItem = [
            title.get_text(strip=True) for title in item.find_all('a', 'title')
        ]
        pageContentItem.extend(
            box.get_text(strip=True) for box in item.find_all('span', 'data-box')
        )

        # The score lives in the first <div> nested inside <div class="pts">.
        ptsTag = item.find('div', 'pts')
        scoreTag = ptsTag.div if ptsTag is not None else None
        pageContentItem.append(
            scoreTag.get_text(strip=True) if scoreTag is not None else ''
        )
        pageContentList.append(pageContentItem)
    return pageContentList


def getURLFormBilibili():
    """Build the URL of every ranking page the crawler visits.

    Every combination of rank type, area, submission filter and time span
    produces one entry.

    Returns:
        A dict mapping a label of the form ``<rank>_<area>_<submit>_<span>``
        to the matching ``https://www.bilibili.com/ranking/...`` URL.
    """
    # URL segment -> human-readable label, one table per path segment.
    rankLabels = {'all': '全站', 'origin': '原創'}
    areaLabels = {
        0: '全站', 1: '動畫', 168: '國漫相關', 3: '音樂', 129: '舞蹈',
        4: '游戲', 36: '科技', 188: '數碼', 160: '生活', 119: '鬼畜',
        155: '時尚', 5: '娛樂', 181: '影視',
    }
    submitLabels = {'0': '全部投稿', '1': '近期投稿'}
    spanLabels = {1: '日排行', 3: '三日排行', 7: '周排行', 30: '月排行'}

    urlTemplate = 'https://www.bilibili.com/ranking/{}/{}/{}/{}'
    urlDict = {}  # label -> URL
    for rankKey, rankLabel in rankLabels.items():
        for areaKey, areaLabel in areaLabels.items():
            for submitKey, submitLabel in submitLabels.items():
                for spanKey, spanLabel in spanLabels.items():
                    label = '_'.join((rankLabel, areaLabel, submitLabel, spanLabel))
                    urlDict[label] = urlTemplate.format(rankKey, areaKey, submitKey, spanKey)
    return urlDict

    
    
class MysqlConnect:
    """Thin helper around ``pymysql`` for the ranking tables.

    Every operation opens its own connection through ``getConnect`` and
    closes it again before returning.
    """

    def __init__(self, host='localhost', user='root', passwd='你的密碼',
                 port=3306, db='bilibilirank', charset='utf8'):
        """Store the connection settings; no connection is opened here.

        The defaults are the values that were previously hard-coded in
        ``getConnect``, so ``MysqlConnect()`` behaves as before.

        Args:
            host: Database server host name.
            user: Login name.
            passwd: Login password.
            port: Server port; must be an int.
            db: Name of the database to use.
            charset: Connection charset; write ``utf8``, not ``utf-8``.
        """
        self.host = host
        self.user = user
        self.passwd = passwd
        self.port = port
        self.db = db
        self.charset = charset

    @staticmethod
    def _quoteIdentifier(name):
        """Return ``name`` wrapped in backticks, doubling embedded backticks."""
        return '`{}`'.format(str(name).replace('`', '``'))

    def getConnect(self):
        """Open and return a new ``pymysql`` connection from the stored settings."""
        return pymysql.connect(
            host=self.host, user=self.user, passwd=self.passwd,
            port=self.port, db=self.db, charset=self.charset
        )

    def insertInfo(self, sql, params=None):
        """Execute one write statement and commit it.

        Args:
            sql: Statement to execute.
            params: Optional values for ``%s`` placeholders in ``sql``. They
                are handed to ``cursor.execute`` so the driver does the quoting.

        On failure the error is printed and the transaction rolled back; the
        method then returns normally so one bad row does not stop the caller.
        The connection is closed in every case.
        """
        db = self.getConnect()
        try:
            cursor = db.cursor()
            try:
                cursor.execute(sql, params)
                db.commit()
                print("sucessed...")
            except Exception:
                print("failed...")
                traceback.print_exc()
                db.rollback()
        finally:
            db.close()

    def queryOutCome(self, sql):
        """Run a query and return its first row.

        Args:
            sql: Query to execute.

        Returns:
            The value of ``cursor.fetchone()``, or ``None`` when the query
            raised (the traceback is printed and the transaction rolled back).
        """
        db = self.getConnect()
        result = None  # stays None when the query fails
        try:
            cursor = db.cursor()
            cursor.execute(sql)
            result = cursor.fetchone()
        except Exception:
            traceback.print_exc()
            db.rollback()
        finally:
            db.close()
        return result

    def getCreateTableSql(self, tableName):
        """Return the ``create table`` statement for a ranking table.

        Args:
            tableName: Name of the table; it is backtick-quoted in the result.
        """
        sql='''
        create table {} (
            id int(11) auto_increment primary key,
            title char(100) NOT NULL UNIQUE,
            playnum char(100) NOT NULL,
            commentnum char(100) NOT NULL,
            author char(100) NOT NULL,
            score char(100) NOT NULL
        )ENGINE=innodb DEFAULT CHARSET=utf8;
        '''.format(self._quoteIdentifier(tableName))
        return sql

    def getInsertToTableSql(self, tableName, title, playnum, commentnum, author, score):
        """Return an ``insert`` statement for one ranking row.

        The five values are converted with ``str`` and escaped before being
        placed inside the quoted literals, so text containing quotes or
        backslashes cannot break out of the statement.

        NOTE(review): passing the values through ``insertInfo(sql, params)``
        with ``%s`` placeholders is the preferred way to insert scraped text;
        this method keeps returning a finished string for existing callers.
        """
        escape = pymysql.converters.escape_string
        values = [
            escape(str(value))
            for value in (title, playnum, commentnum, author, score)
        ]
        sql='''
        insert into {} values(null,'{}','{}','{}','{}','{}');
        '''.format(self._quoteIdentifier(tableName), *values)
        return sql

    def createTable(self, tableName, sql):
        """Drop ``tableName`` if it exists, then run ``sql`` to create it.

        Args:
            tableName: Table to drop; quoted the same way as in
                ``getCreateTableSql``.
            sql: The ``create table`` statement to execute afterwards.
        """
        db = self.getConnect()
        try:
            cursor = db.cursor()
            cursor.execute("drop table if exists {}".format(self._quoteIdentifier(tableName)))
            cursor.execute(sql)
        finally:
            # Close the connection even when one of the statements fails.
            db.close()


if __name__ == "__main__":
    # Crawl every ranking page and store each one in a table of its own.
    urlDict = getURLFormBilibili()  # ranking label -> page URL
    mysqlconnect = MysqlConnect()   # database helper

    for urlName, url in urlDict.items():
        print("正在處理"+urlName+"頁面...")
        tableName = urlName  # the ranking label doubles as the table name
        createsql = mysqlconnect.getCreateTableSql(tableName)
        mysqlconnect.createTable(tableName, createsql)
        for contentItem in getPage(url):
            # A row needs title, play count, comment count, author and score.
            # Skip an incomplete item instead of letting an IndexError abort
            # the whole crawl.
            if len(contentItem) < 5:
                print("skipped incomplete item: {}".format(contentItem))
                continue
            title, playnum, commentnum, author, score = contentItem[:5]
            insertsql = mysqlconnect.getInsertToTableSql(
                tableName, title, playnum, commentnum, author, score
            )
            print(insertsql)
            mysqlconnect.insertInfo(insertsql)

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯系本站郵箱yoyou2525@163.com刪除。