Python web scraping and jieba word segmentation: analyzing reviews of The Climbers (《攀登者》)


Scraping and analyzing reviews of The Climbers

0. Project structure

(screenshot of the project layout omitted)

Here simkai.ttf is the KaiTi font file; on Windows, the fonts bundled with the system can be found under

C:\Windows\Fonts
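Before running the scripts, it may help to confirm the font is actually in the project folder. The sketch below is my own addition, not part of the original post (the copy source path is an assumption that holds on a default Windows install):

import os
import shutil

FONT = './simkai.ttf'  # the path the word-cloud script below expects

if not os.path.exists(FONT):
    # KaiTi usually ships with Windows as C:\Windows\Fonts\simkai.ttf
    src = r'C:\Windows\Fonts\simkai.ttf'
    if os.path.exists(src):
        shutil.copy(src, FONT)
    else:
        print('simkai.ttf not found; pick another Chinese font from C:\\Windows\\Fonts')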

1. Scraping Douban review data

# -*- coding: utf-8 -*-
"""爬取豆瓣影評"""
import requests
from lxml import etree
import time

url = "https://movie.douban.com/subject/30413052/comments?start=%d&limit=20&sort=new_score&status=P"

# Request headers
headers = {'Host': 'movie.douban.com',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:70.0) Gecko/20100101 Firefox/70.0',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
#'Accept-Encoding': 'gzip, deflate, br',
'Connection': 'keep-alive',
'Cookie': 'bid=TXwfIvNFTRE; douban-fav-remind=1; __gads=ID=e042951d078c30b3:T=1570518321:S=ALNI_Mbp-ZmoryuBFEnTQy24mwdf0B89ig; __utma=30149280.1448315194.1570518324.1570518324.1572927825.2; __utmz=30149280.1570518324.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; _pk_id.100001.4cf6=589509e524ead00f.1572927824.1.1572927824.1572927824.; _pk_ses.100001.4cf6=*; __utmb=30149280.0.10.1572927825; __utmc=30149280; __utma=223695111.1094105223.1572927825.1572927825.1572927825.1; __utmb=223695111.0.10.1572927825; __utmc=223695111; __utmz=223695111.1572927825.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); ap_v=0,6.0',
'Upgrade-Insecure-Requests': '1',
'Cache-Control': 'max-age=0'}
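# Note: the Cookie above is tied to one browser session and will eventually expire;
# replace it with a cookie copied from your own browser before running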

if __name__ == '__main__':
    
    f = open("./climb.csv", mode="w", encoding='utf-8')
    f.write("author\tcomment\tvotes\n")
    
    # start: 0, 20, 40, ..., 200
    for i in range(11):  # range is half-open, so i runs from 0 to 10
        # 1. Build the page URL; Douban only exposes the first 11 pages of comments here
        url_climb = url % (i * 20)
            
        # 2. Send the request, set the encoding, and get the page text
        response = requests.get(url_climb, headers=headers)
        response.encoding = "utf-8"
        text = response.text
        
        # (Optional) dump the raw HTML for debugging
        #with open("./climb.html", mode="w", encoding="utf-8") as f:
        #    f.write(text)
            
        # Parse the HTML with lxml's etree
        html = etree.HTML(text)
        comments = html.xpath('//div[@id="comments"]/div[@class="comment-item"]')
        for comment in comments:
            # Reviewer's name
            author = comment.xpath('./div[@class="avatar"]/a/@title')[0].strip()
            # Review text
            p = comment.xpath('.//span[@class="short"]/text()')[0].strip()

            # Upvote count for this review
            vote = comment.xpath('.//span[@class="votes"]/text()')[0].strip()

            #print(author, p, vote)
            f.write("%s\t%s\t%s\n" % (author, p, vote))
       
        # Progress message; sleep one second between pages to stay under the anti-scraping limits
        print("Page %d saved" % (i + 1))
        time.sleep(1)
            
    f.close()      
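As a quick sanity check (my own sketch, not part of the original post), the scraped file can be loaded back to confirm it holds about 11 pages of 20 comments each:

import pandas as pd

df = pd.read_csv('./climb.csv', sep='\t')
print(df.shape)   # expect roughly (220, 3)
print(df.head())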

2. Sentiment analysis of the reviews

# -*- coding: utf-8 -*-
"""
pandas:python data analysis lib,返回值為DataFrame(行,列),行是樣本,列為屬性    
"""
import pandas as pd
from snownlp import SnowNLP

# Show all columns when printing DataFrames
pd.set_option('display.max_columns', None)

def convert(comment):
    """Run sentiment analysis on a single review."""
    snow = SnowNLP(str(comment))
    sentiments = snow.sentiments  # score in [0, 1]: 0 = negative, 1 = positive
    return sentiments

if __name__ =='__main__':
    
    data = pd.read_csv('./climb.csv', sep='\t')
    #print(data.head(), "\n", data.shape)
    
    # Score each review; this adds a new column named '情感評分'
    # (sentiment score) to the DataFrame, then sorts by it, descending
    data['情感評分'] = data.comment.apply(convert)
    data.sort_values(by='情感評分', ascending=False, inplace=True)
    
    # Save the scored data
    data.to_csv('./climb_snownlp.csv', sep='\t', index=False, encoding='utf-8')
    
    print(data[:5])   # the five most positive reviews
    print(data[-5:])  # the five most negative reviews
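The overall tone is easier to judge from the distribution of scores than from the extremes alone. Below is a minimal histogram sketch, my own addition; it assumes the climb_snownlp.csv produced above, and the output filename is hypothetical:

import pandas as pd
import matplotlib.pyplot as plt

data = pd.read_csv('./climb_snownlp.csv', sep='\t')
plt.hist(data['情感評分'], bins=20, range=(0, 1), edgecolor='black')
plt.xlabel('SnowNLP sentiment score')
plt.ylabel('Number of reviews')
plt.savefig('./sentiment_hist.jpg', dpi=200)  # hypothetical output filename
plt.show()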

3. jieba word segmentation of the reviews, with a keyword bar chart and word cloud

# -*- coding: utf-8 -*-

import pandas as pd
import jieba
from jieba import analyse
import matplotlib.pyplot as plt
import numpy as np
import wordcloud
from PIL import Image

if __name__ == '__main__':
    data = pd.read_csv('./climb.csv', sep='\t')
    
    # Join every review into one string (list comprehension over the comment column)
    comments = ';'.join([str(c) for c in data['comment'].tolist()])
    #print(comments)
    
    # Segment the text with jieba; cut() returns a generator
    gen_ret = jieba.cut(comments)
    seg_words = '/'.join(gen_ret)
    #print(seg_words)
    
    # Extract keywords by TF-IDF: topK is how many keywords to return,
    # withWeight=True also returns each keyword's weight
    tags_ret = analyse.extract_tags(seg_words, topK=500, withWeight=True)
    #print(tags_ret)
    # Put the (keyword, weight) pairs into a DataFrame, sorted by weight, descending
    df_ret = pd.DataFrame(tags_ret, columns=['詞語', '重要性'])
    df_ret.sort_values(by='重要性', ascending=False, inplace=True)
    #print(df_ret)
    
    # Visualize: of the 500 keywords, plot the top 20
    plt.barh(y=np.arange(0, 20), width=df_ret[:20]['重要性'][::-1])
    plt.xlabel('Importance')
    plt.yticks(np.arange(0, 20), labels=df_ret[:20]['詞語'][::-1], fontproperties='KaiTi')
    # Save the bar chart. NOTE: savefig must be called before show(); dpi sets the pixel density
    plt.savefig('./條形圖_20個keyword.jpg', dpi=200)
    plt.show()
    
    # Word cloud
    bg = np.array(Image.open('./bg.jpg'))  # mask image that shapes the cloud
    words = dict(tags_ret)  # keyword -> weight mapping
    cloud = wordcloud.WordCloud(width=1200, height=968,
                        font_path='./simkai.ttf',  # font that can render Chinese
                        background_color='white', mask=bg,
                        max_words=500, max_font_size=150)
    # Render the cloud from the keyword weights
    word_cloud = cloud.generate_from_frequencies(words)
    plt.figure(figsize=(12, 12))
    plt.imshow(word_cloud)
    plt.axis('off')  # hide the axes around the image
    # Save the word cloud (again, before show())
    plt.savefig('./攀登者詞雲.jpg', dpi=200)
    plt.show()
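As an aside, the wordcloud library can also write the image directly at its native 1200×968 resolution via WordCloud.to_file, skipping matplotlib entirely (the output filename here is hypothetical):

word_cloud.to_file('./攀登者詞雲_raw.png')  # hypothetical filename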

