Python爬蟲實戰+數據分析+數據可視化（豆瓣八佰電影影評）

本文轉載自查看原文 2021-06-30 21:52 144

一、爬蟲部分

爬蟲說明： 1、本爬蟲是以面向對象的方式進行代碼架構的 2、本爬蟲爬取的數據存入到MongoDB數據庫中 3、爬蟲代碼中有詳細注釋

代碼展示

    import re
   import time
   from pymongo import MongoClient
   import requests
   from lxml import html
    
   class BaBaiSpider():
       def __init__(self):
           self.start_url = 'https://movie.douban.com/subject/26754233/reviews'
           self.url_temp = 'https://movie.douban.com/subject/26754233/reviews?start={}'
           # 由於豆瓣有ip地址訪問的反爬機制 需要登錄賬戶后獲取Cookie信息
           # 有條件的可以使用ip代理池
           self.headers = {
               "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36",
               'Cookie': 'll="118160"; bid=jBJGzgkqoW0; _ga=GA1.2.299310750.1603415173; _vwo_uuid_v2=D02C810B09B328A9291DA2DE0215B1F4E|7b20627b7b4770d357d6251faaad13b7; __yadk_uid=NVdS10Z9dQ70V1AkBBbqmLR6Ny6AQC6R; UM_distinctid=175530c360058f-0cd5eb2121026b-3e604000-144000-175530c3601502; Hm_lvt_19fc7b106453f97b6a84d64302f21a04=1603416111; __utmv=30149280.22554; douban-fav-remind=1; __gads=ID=9b3fe7aa29748925-22a3ff1066c400c6:T=1603618426:RT=1603618426:S=ALNI_MZdkcEBUdorLQd-nNQm0ECaz6aPgQ; __utmc=30149280; __utmc=223695111; ap_v=0,6.0; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1610800679%2C%22https%3A%2F%2Faccounts.douban.com%2F%22%5D; _pk_ses.100001.4cf6=*; push_doumail_num=0; push_noty_num=0; dbcl2="225547599:+KzDIeqUyH8"; ck=S_qd; __utmt=1; douban-profile-remind=1; __utma=30149280.299310750.1603415173.1610800679.1610803327.13; __utmb=30149280.0.10.1610803327; __utmz=30149280.1610803327.13.11.utmcsr=baidu|utmccn=(organic)|utmcmd=organic|utmctr=%E8%B1%86%E7%93%A3%E7%94%B5%E5%BD%B1; __utma=223695111.299310750.1603415173.1610800679.1610803327.7; __utmb=223695111.0.10.1610803327; __utmz=223695111.1610803327.7.6.utmcsr=baidu|utmccn=(organic)|utmcmd=organic|utmctr=%E8%B1%86%E7%93%A3%E7%94%B5%E5%BD%B1; _pk_id.100001.4cf6=77003652978e8b92.1603415561.6.1610803542.1610797625.'
           }
           # 初始化MongoDB數據庫
           self.client = MongoClient()
           self.collection = self.client['test']['babai']
    
       # 構造列表頁url
       def get_url_list(self,total_page):
           return [self.url_temp.format(i*20) for i in range(int(total_page)+1)]
    
       # 請求並解析url地址
       def parse_url(self,url):
           rest = requests.get(url,headers=self.headers)
           time.sleep(2)
           return rest.content.decode()
    
       # 獲取並解析列表頁評論數據
       def get_item(self,str_html):
           new_html = html.etree.HTML(str_html)
           div_list = new_html.xpath('//div[@class="review-list "]/div')
           # 獲取信息多采用三目運算符的方式 防止因獲取的內容不存在而報異常
           # 通過三目運算符進行多重判斷可以增加程序的穩定性
           for i in div_list:
               item = {}
               title = i.xpath('.//div[@class="main-bd"]/h2/a/text()')
               item['評論標題'] = title[0] if len(title)>0 else None
               name = i.xpath('.//a[@class="name"]/text()')
               item['評論人姓名'] = name[0] if len(name)>0 else None
               rate = i.xpath('.//span[contains(@class,"main-title-rating")]/@title')
               item['評價'] = rate[0] if len(rate)>0 else None
               time = i.xpath('.//span[@class="main-meta"]/text()')
               item['評論時間'] = time[0] if len(time) > 0 else None
               favor = i.xpath('.//div[@class="action"]/a[1]/span/text()')
               item['贊成數'] = favor[0].strip() if len(favor)>0 else None
               oppose = i.xpath('.//div[@class="action"]/a[2]/span/text()')
               item['反對數'] = oppose[0].strip() if len(oppose)>0 else None
               reply = i.xpath('.//a[@class="reply "]/text()')
               item['回復數'] = reply[0].split('回應')[0] if len(reply)>0 else None
               star = i.xpath('.//span[contains(@class,"main-title-rating")]/@class')
               item['評論得分'] = re.findall(r'allstar(\d)0 main-title-rating',star[0])[0] if len(star)>0 else None
               print(item)
               self.save(item)
                
       # 保存評論數據
       def save(self,item):
           self.collection.insert(item)
    
       def run(self):
           # 獲取數據總頁碼數
           rest = requests.get(self.start_url,headers=self.headers)
           str_html = html.etree.HTML(rest.content.decode())
           total_page= str_html.xpath('//div[@class="paginator"]/a[last()]/text()')[0]
           url_list = self.get_url_list(total_page)
           for url in url_list:
               old_html = self.parse_url(url)
               self.get_item(old_html)
    
   if __name__ == '__main__':
       babai = BaBaiSpider()
       babai.run()

二、數據分析和數據可視化部分

數據分析和數據可視化說明： 1、本博客通過Flask框架來進行數據分析和數據可視化 2、項目的架構圖為在這里插入圖片描述

代碼展示

數據分析代碼展示（analysis.py）

    from pymongo import MongoClient
   import pandas as pd
   import jieba
   import pymysql
   from wordcloud import WordCloud
   from matplotlib import pyplot as plt
   import cv2 as cv
   import numpy as np
    
   # 評論標題詞雲
   def word_cloud(df):
       title_list = df['評論標題'].tolist()
       pro_title_list = [' '.join(list(jieba.cut(i))) for i in title_list]
       cut_text = ' '.join(pro_title_list)
    
       # 讀入圖片背景
       # 對於中文詞雲首先使用jieba來中文分詞，然后還要記得指定font_path設置字體識別
       # 想要的話還能設置詞雲的背景圖片
    
       background_image = cv.imread(r'../static/images/love.jpeg') # 不設置background_image可以不加這行，得到的詞雲就是矩形了
    
       word_cloud = WordCloud(font_path="C:/Windows/Fonts/simfang.ttf", mask=background_image,
                             background_color='white').generate(cut_text)
       plt.figure(figsize=(10,10))
       plt.imshow(word_cloud,interpolation="bilinear")
       plt.axis("off")
       # 將詞雲圖保存到靜態文件的images目錄下 方便后續的展示
       plt.savefig(r'../static/images/wordCount.jpg')
       plt.show()
    
   # 評論數量隨時間的變化
   def hour_count(df):
       # 按照小時進行分組求出不同時刻的評論數量
       grouped = df.groupby('評論小時')['評論標題'].count().reset_index()
       data = [[i['評論小時'],i['評論標題']] for i in grouped.to_dict(orient='records')]
       print(data)
       return data
    
   # 不同評價星級的數量
   def star_count(df):
       # 按照評論星級進行分組求不同星級評價的數量
       grouped = df.groupby('評論得分')['評論標題'].count().reset_index()
       data = [[i['評論得分'],i['評論標題']] for i in grouped.to_dict(orient='records')]
       return data
    
   # 評分均值隨時間的變化
   def star_avg(df):
       # 將評論小時列數據轉換成int類型 方面后續求均值
       df['評論得分'] = df['評論得分'].apply(lambda x:int(x))
       grouped = df.groupby('評論小時')['評論得分'].mean().reset_index()
       data = [[i['評論小時'],round(i['評論得分'],1)] for i in grouped.to_dict(orient='records')]
       return data
    
    
   if __name__ == '__main__':
       # Load every crawled review from MongoDB (test.babai), leaving out the _id field.
       client = MongoClient()
       collection = client['test']['babai']
       comments = collection.find({},{'_id':0})
    
       df = pd.DataFrame(comments)
       print(df.info())
       print(df.head(1))
    
       # Drop incomplete reviews; how='any' removes a row when ANY of its fields is
       # missing, not only the title or the score.
       df.dropna(how='any',inplace=True)
    
       # Convert the favor / oppose counts to int; an empty string counts as 0.
       df['贊成數'] = df['贊成數'].apply(lambda x:int(x)if len(x)>0 else 0)
       df['反對數'] = df['反對數'].apply(lambda x:int(x)if len(x)>0 else 0)
    
       # Convert the review time to a pandas datetime.
       df['評論時間'] = pd.to_datetime(df['評論時間'])
       date = pd.DatetimeIndex(df['評論時間'])
       # Add the hour-of-day column that the grouping functions rely on.
       df['評論小時'] = date.hour
    
       # The script produces one data set per run: enable exactly one of the
       # analyses below together with the matching insert statement further down.

       # Word cloud of the review titles
       # word_cloud(df)
    
       # Number of reviews per hour
       # data = hour_count(df)
    
       # Number of reviews per star rating
       # data = star_count(df)
    
       # Average score per hour
       data = star_avg(df)
    
    
       # Open the MySQL connection that receives the aggregated rows.
       conn = pymysql.connect(host='localhost',user='root',password='123456',port=3306,database='babai',charset='utf8')
       with conn.cursor() as cursor:
           # Number of reviews per hour
           # sql = 'insert into db_hour_count(hour,count) values(%s,%s)'
    
           # Number of reviews per star rating
           # sql = 'insert into db_star_count(star,count) values(%s,%s)'
    
           # Average score per hour
           sql = 'insert into db_star_avg(hour,star_avg) values(%s,%s)'
           try:
               result = cursor.executemany(sql,data)
               # Commit only when executemany returned a truthy result.
               if result:
                   print('插入數據成功')
                   conn.commit()
           except pymysql.MySQLError as error:
               print(error)
               conn.rollback()
           finally:
               conn.close()

數據庫模型文件展示（models.py）

    from . import db
    
   # 時刻與評論數量關系模型
   class HourCount(db.Model):
       __tablename__ = 'db_hour_count'
       id = db.Column(db.Integer,primary_key=True,autoincrement=True)
       hour = db.Column(db.Integer,nullable=False)
       count = db.Column(db.Integer,nullable=False)
    
   # 評價星級與評價數量關系模型
   class StarCount(db.Model):
       __tablename__ = 'db_star_count'
       id = db.Column(db.Integer,primary_key=True,autoincrement=True)
       star = db.Column(db.Integer,nullable=False)
       count = db.Column(db.Integer,nullable=False)
    
   # 評分均值與隨時間關系模型
   class StarAvg(db.Model):
       __tablename__ = 'db_star_avg'
       id = db.Column(db.Integer,primary_key=True,autoincrement=True)
       hour = db.Column(db.Integer,nullable=False)
       star_avg = db.Column(db.Float,nullable=False)

配置文件代碼展示（config.py）

    class Config(object):
       SECRET_KEY = 'ma5211314'
       SQLALCHEMY_DATABASE_URI = 'mysql://root:123456@localhost:3306/cateye'
       SQLALCHEMY_TRACK_MODIFICATIONS = True
    
   class DevelopmentConfig(Config):
       DEBUG = True
    
   class ProjectConfig(Config):
       pass
        
   # 采用映射方式方便后續調用配置類
   config_map = {
       'develop':DevelopmentConfig,
       'project':ProjectConfig
   }

主工程目錄代碼展示（api_1_0/__init__.py）

    from flask import Flask
   from flask_sqlalchemy import SQLAlchemy
   import pymysql
   from config import config_map
   # python3的pymysql取代了mysqldb庫 為了防止出現 ImportError: No module named ‘MySQLdb’的錯誤
   pymysql.install_as_MySQLdb()
    
   db = SQLAlchemy()
   # 采用工廠模式創建app實例
   def create_app(mode='develop'):
       app = Flask(__name__)
       # 加載配置類
       config = config_map[mode]
       app.config.from_object(config)
    
       # 加載數據庫
       db.init_app(app)
    
       # 導入藍圖
       from . import view
       app.register_blueprint(view.blue,url_prefix='/show')
    
       return app

主程序文件代碼展示（manager.py）

    from api_1_0 import create_app,db
   from flask_script import Manager
   from flask_migrate import Migrate,MigrateCommand
   from flask import render_template
    
   # Build the app with create_app()'s default mode.
   app = create_app()
    
   # Wrap the app in a Flask-Script manager and attach Flask-Migrate to app and db.
   manager = Manager(app)
   Migrate(app,db)
    
   # Register the migration commands under the "db" sub-command.
   # NOTE(review): MigrateCommand and flask_script may be unavailable in newer
   # Flask-Migrate / Flask releases - confirm against the installed versions.
   manager.add_command('db',MigrateCommand)
    
   # Home page
   @app.route('/')
   def index():
       """Render the home page template ``index.html``."""
       return render_template('index.html')
    
   if __name__ == '__main__':
       manager.run()

視圖文件代碼展示（api_1_0/views/__init__.py，show.py）

__init__.py

    from flask import Blueprint
   # 為了在主程序運行時能夠加載到模型類
   from api_1_0 import model
   blue = Blueprint('show',__name__)
    
   # 導入定義的視圖函數
   from . import show

show.py

    from . import blue
   from api_1_0.models import HourCount,StarCount,StarAvg
   from flask import render_template
    
   # 詞雲圖
   @blue.route('/drawCloud')
   def drawCloud():
       """Render the word-cloud page (template ``drawCloud.html``)."""
       return render_template('drawCloud.html')
    
   # 評論數量隨時間的變化折線圖和評論均值隨時間的變化折線圖
   @blue.route('/drawLine')
   def drawLine():
       hour_count = HourCount.query.all()
       hour_star_avg = StarAvg.query.all()
       # 構造折線圖所需數據 兩個數組
       hour = [i.hour for i in hour_count]
       count = [i.count for i in hour_count]
       star_avg = [i.star_avg for i in hour_star_avg]
    
       return render_template('drawLine.html',**locals())
    
   # 不同星級評價的數量占比圖
   @blue.route('/drawPie')
   def drawPie():
       star_count = StarCount.query.all()
       # 構造畫餅圖所需數據格式數組嵌套字典
       data = [{'name':i.star,'value':i.count} for i in star_count]
       return render_template('drawPie.html',**locals())

主頁展示（index.html）

主頁簡單創建了三個超鏈接指向對應的圖表

    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <title>首頁說明</title>
        <style>
            .container{
                width: 100%;
                height: 600px;
                padding: 40px;
                line-height: 60px;
            }
            ul{
                margin: auto;
                width: 60%;
            }
        </style>
    </head>
    <body>
        <div class="container">
            <ul>
                <!-- Root-relative links keep working on any host and port, unlike hard-coded http://127.0.0.1:5000 URLs. -->
                <li><a href="/show/drawCloud" target="_blank"><h3>評論標題詞雲</h3></a></li>
                <li><a href="/show/drawLine" target="_blank"><h3>評論數量隨時間的變化折線圖&amp;評論均值隨時間的變化折線圖</h3></a></li>
                <li><a href="/show/drawPie" target="_blank"><h3>不同星級評價的數量占比圖</h3></a></li>
            </ul>
        </div>
    </body>
    </html>

模板文件代碼展示（drawCloud.html，drawLine.html，drawPie.html）

drawCloud.html

    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <title>詞雲圖</title>
        <style>
            .container{
                width: 1000px;
                margin: auto;
                padding-top: 50px;
            }
            img{
                width: 800px;
                height: 600px;
            }
    
        </style>
    </head>
    <body>
    <div class="container">
        <!-- 圖片地址為數據分析中生成並保存的詞雲圖 -->
        <img src="../static/images/wordCount.jpg">
    </div>
    </body>
    </html>

在這里插入圖片描述

結論：除了「電影」和一些常用詞之外，「英雄」、「歷史」、「戰爭」的詞頻最高，所以可以初步判斷《八佰》是以歷史戰爭為題材的電影。

drawLine.html

    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <title>評論數量隨時間的變化折線圖和評論均值隨時間的變化折線圖</title>
        <script src="../static/js/echarts.min.js"></script>
        <script src="../static/theme/vintage.js"></script>
        <style>
            .chart_group{
                width: 100%;
                display: flex;
                justify-content: space-between;
                padding: 50px;
                box-sizing: border-box;
            }
        </style>
    </head>
    <body>
    <div class="chart_group">
        <div class="chat1" style="width: 700px;height: 500px"></div>
        <div class="chat2" style="width: 700px;height: 500px"></div>
    </div>
    <script>
        var myCharts1 = echarts.init(document.querySelector('.chat1'),'vintage')
        var myCharts2 = echarts.init(document.querySelector('.chat2'),'vintage')
        var hour = {{ hour|tojson }}
        var count = {{ count|tojson }}
        var star_avg = {{star_avg|tojson }}
    
    /**
     * Build the ECharts option object for one smoothed line chart.
     *
     * @param {Array} category   x-axis categories (hours of the day)
     * @param {Array} data       y values, parallel to `category`
     * @param {string} title_text chart title
     * @param {string} desc      series name; also used for the legend entry and
     *                           as the value label in the tooltip
     * @returns {Object} option object to pass to `setOption`
     */
    function getOptions(category,data,title_text,desc){
        // Marker for an extreme value (type 'max' or 'min'), labelled with its name.
        function extremePoint(name,type){
            return {
                name:name,
                type:type,
                symbolSize:[40,40],
                symbolOffset:[0,-20],
                label:{
                    show:true,
                    formatter:function (arg)
                    {
                        return arg.name
                    }
                }
            }
        }

        var option = {
            title:{
                text:title_text,
                textStyle:{
                    fontFamily:'楷體',
                    fontSize:21
                }
            },
            xAxis:{
                type:'category',
                data: category,
                axisLabel:{
                    interval:0,
                    rotate:40,
                    margin:10
                }
            },
            yAxis:{
                type:'value',
                scale:true
            },
            legend:{
                // Legend entries are listed under `data`; `name` is not a legend option.
                data:[desc],
                top:20
            },
            tooltip:{
                trigger:'axis',
                triggerOn:'mousemove',
                formatter:function(arg){
                    // Label the value with the series description so the average-score
                    // chart is not captioned as a review count.
                    return '評論時刻：'+arg[0].name+':00'+'<br>'+desc+'：'+arg[0].value
                }
            },
            series:[
                {
                    name:desc,
                    type:'line',
                    data:data,
                    label:{
                        show:true
                    },
                    smooth:true,
                    markLine:{
                        data:[
                            {
                                name:'平均值',
                                type:'average',
                                label: {
                                    show:true,
                                    formatter:function(arg)
                                    {
                                        return arg.name+':\n'+arg.value
                                    }
                                }
                            }
                        ]
                    },
                    markPoint:{
                        data:[
                            extremePoint('最大值','max'),
                            extremePoint('最小值','min')
                        ]
                    }
                }
            ]
        }
        return option
    }
    
        var option1 = getOptions(hour,count,'評論數量隨時間的變化','評論數量')
        var option2 = getOptions(hour,star_avg,'評論均值隨時間的變化','評論均值')
        myCharts1.setOption(option1)
        myCharts2.setOption(option2)
    </script>
    </body>
    </html>

在這里插入圖片描述

結論：影迷們大都在21點至凌晨1點左右觀影評論，可見影迷們大都是夜貓子；而凌晨1點至中午11點影評的評分普遍低於平均分，熬夜和中午吃飯之前影迷們大都處在心情不大好的狀態，所以一定要少熬夜多吃飯。

drawPie.html

    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <title>不同星級評價的數量占比圖</title>
        <script src="../static/js/echarts.min.js"></script>
        <script src="../static/theme/vintage.js"></script>
    </head>
    <body>
    <div class="chat" style="width: 800px;height: 600px;margin: auto"></div>
    <script>
        var myCharts = echarts.init(document.querySelector('.chat'),'vintage')
        var data = {{ data|tojson }}
        var option = {
            title:{
                text:'不同星級評價的數量占比',
                textStyle:{
                    fontFamily:'楷體',
                    fontSize:21
                }
            },
            legend:{
                name:['星級'],
                left:40,
                bottom:40,
                orient:'verticals',
                formatter:function(arg)
                {
                    return arg+'星'
                }
            },
            tooltip:{
              trigger:'item',
              triggerOn:'mousemove',
              formatter:function(arg)
              {
                  return '評價星級：'+arg.name+'星'+'<br>'+'評價數量：'+arg.value+'<br>'+'評價占比：'+arg.percent+"%"
              },
            },
            series:[
                {
                    name:'星級',
                    type:'pie',
                    data:data,
                    label:{
                        show:true,
                        formatter:function (arg)
                        {
                            return arg.name+'星'
                        }
                    },
                    {#roseType:'radius',#} //南丁格爾玫瑰圖
                    radius:['50%','80%'],
                    selectedMode:'multiple',
                    selectedOffset:20
                }
            ]
        }
        myCharts.setOption(option)
    </script>
    </body>
    </html>

在這里插入圖片描述

結論：影迷們對八佰這部電影的評價普遍很高，5星和4星評論占總評分的80%左右，可見這部電影的受歡迎程度。

以下是項目源碼，希望能夠幫助你們，如有疑問，下方評論 flask項目代碼鏈接

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯系本站郵箱yoyou2525@163.com刪除。

猜您在找 【python數據分析實戰】電影票房數據分析(二)數據可視化爬蟲:python采集豆瓣影評信息並進行數據分析實現爬蟲、數據分析及可視化 Python爬蟲+數據分析+數據可視化（分析《雪中悍刀行》彈幕） Python數據分析實戰（3）Python實現數據可視化【python】B站彈幕數據分析及可視化（爬蟲+數據挖掘) 《Python數據分析》筆記——數據可視化 Python數據分析~seaborn數據可視化爬取豆瓣電影評分top250數據分析數據分析與可視化