Python 對影評進行評論分析，形成詞雲圖

本文轉載自查看原文 2019-04-06 11:37 572 python_BBS

1 # -*- coding:utf-8 -*-

2 '''

3 抓取豆瓣電影某部電影的評論

4抓取電影

5 網址鏈接:https://movie.douban.com/subject/26630781/comments

6 為了抓取全部評論需要先進行登錄

7 '''

8 from selenium import webdriver

9 import time

10 import codecs

11 import jieba

12 import jieba.analyse as analyse

13 from wordcloud import WordCloud

14 from scipy.misc import imread

15 from os import path

def get_douban_comments(url, user_name='1111111', password='11111111',
                        output_file='pjl_comment.txt',
                        start_page=501, start_count=10000):
    """Log in to Douban with Firefox and scrape the comments starting at *url*.

    The captcha must be read from the opened browser window and typed into
    the console by the operator. Comment texts are collected page by page
    (clicking the element with class ``next``) until any exception is raised,
    then appended to *output_file* as UTF-8, one comment per line.

    Args:
        url: Comment-list page to start scraping from.
        user_name: Douban account name (placeholder default; replace it).
        password: Douban password (placeholder default; replace it).
        output_file: File the collected comments are appended to.
        start_page: Initial value of the page counter; only used in the
            progress messages.
        start_count: Initial value of the comment counter; only used in the
            progress messages.
    """
    # Works on both Python 2 (raw_input) and Python 3 (input).
    try:
        read_line = raw_input
    except NameError:
        read_line = input

    login_url = 'https://accounts.douban.com/login?source=movie'
    comments = []
    page_no = start_page
    comment_no = start_count

    driver = webdriver.Firefox()
    try:
        driver.get(login_url)

        email_box = driver.find_element_by_id('email')
        email_box.clear()
        email_box.send_keys(user_name)

        password_box = driver.find_element_by_id('password')
        password_box.clear()
        password_box.send_keys(password)

        # The captcha cannot be solved automatically; ask the operator.
        captcha_text = read_line('請打開瀏覽器輸入驗證碼:')
        driver.find_element_by_id('captcha_field').send_keys(captcha_text)
        driver.find_element_by_class_name('btn-submit').click()
        time.sleep(5)  # give the post-login redirect time to finish

        driver.get(url)
        driver.implicitly_wait(3)

        while True:
            try:
                for node in driver.find_elements_by_class_name('comment'):
                    text = node.find_element_by_tag_name('p').text
                    comments.append(text + u'\n')
                    print(u"查找到第%d個評論" % comment_no)
                    comment_no += 1
                driver.find_element_by_class_name('next').click()
                print(u'第%d頁查找完畢!' % page_no)
                page_no += 1
                time.sleep(4)  # pause between page loads
            except Exception as e:
                # Any failure ends the crawl (presumably the missing "next"
                # link on the last page); what was collected is still saved.
                print(e)
                break
    finally:
        # Always shut the browser down, even when login or scraping fails.
        driver.quit()

    with codecs.open(output_file, 'a', encoding='utf-8') as f:
        f.writelines(comments)
    print(u"查找到第%d頁,第%d個評論!" % (page_no, comment_no))

# Collect every segmented word together with its frequency
def get_all_keywords(file_name):
    """Segment *file_name* with jieba and report how often each word occurs.

    Each line of the UTF-8 input file is cut with ``jieba.cut``. Every
    distinct token is printed as ``rank---word:count次`` and written as
    ``word:count次`` to ``count_word.txt`` (overwritten, UTF-8).

    Entries are ordered from most to least frequent. Frequencies are
    gathered in a single pass with ``Counter`` instead of calling
    ``list.count`` once per distinct word.

    Args:
        file_name: Path of the UTF-8 text file to analyse.
    """
    # Local import keeps this block self-contained.
    from collections import Counter

    with codecs.open(file_name, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    frequencies = Counter()
    for line in lines:
        frequencies.update(jieba.cut(line))

    print(u"共有%d個關鍵詞" % len(frequencies))

    report = []
    for rank, (word, times) in enumerate(frequencies.most_common(), 1):
        report.append(u"%s:%d次\n" % (word, times))
        print(u"%d---%s:%d次" % (rank, word, times))

    with codecs.open('count_word.txt', 'w', encoding='utf-8') as f:
        f.writelines(report)

def get_top_keywords(file_name, top_k=20):
    """Print the highest-ranked TextRank keywords of a UTF-8 text file.

    The whole file is read as one string and passed to
    ``jieba.analyse.textrank`` with weights and flags enabled. One line is
    printed per keyword: its 1-based position, the parts of the first result
    element, and the weight.

    Args:
        file_name: Path of the UTF-8 text file to analyse.
        top_k: Number of keywords requested from TextRank (default 20).
    """
    with codecs.open(file_name, 'r', encoding='utf-8') as f:
        texts = f.read()

    ranked = analyse.textrank(texts, topK=top_k, withWeight=True, withFlag=True)
    for position, entry in enumerate(ranked, 1):
        tagged_word, weight = entry[0], entry[1]
        # tagged_word is iterated as its parts (keyword and part-of-speech
        # flag, per the original note); spacing matches the former
        # trailing-comma print statements.
        parts = u"".join(u" %s  " % part for part in tagged_word)
        print(u"%d:%s 權重:%s" % (position, parts, weight))

# Draw the word cloud
def draw_wordcloud(comment_file='pjl_comment.txt',
                   mask_path="F:/python2.7work/wordcloud/alice_color.png",
                   font_path=None,
                   output_file="pjl_cloud.jpg"):
    """Render a word-cloud image from the scraped comments.

    The comment text is segmented with jieba, joined with spaces and passed
    to ``WordCloud``, using the image at *mask_path* as the mask. Calling the
    function without arguments uses the same files as before.

    Args:
        comment_file: UTF-8 text file holding the comments.
        mask_path: Image read with ``imread`` and used as the cloud mask.
        font_path: Font file for rendering; defaults to ``simsun.ttc`` in
            the directory of this script.
        output_file: Path the generated image is written to.
    """
    with codecs.open(comment_file, encoding='utf-8') as f:
        comment_text = f.read()

    # WordCloud receives space-separated tokens, so join the jieba segments.
    cut_text = " ".join(jieba.cut(comment_text))

    if font_path is None:
        font_path = path.join(path.dirname(__file__), 'simsun.ttc')

    color_mask = imread(mask_path)
    cloud = WordCloud(
        font_path=font_path,
        background_color='white',
        mask=color_mask,
        max_words=2000,
        max_font_size=40,
    )
    cloud.generate(cut_text).to_file(output_file)

103

104

105

if __name__ == '__main__':
    # Script entry point. The scraping and keyword-ranking steps below are
    # disabled; only the word cloud is drawn.
    #
    # url = 'https://movie.douban.com/subject/26630781/comments?start=10581&limit=20&sort=new_score'
    # get_douban_comments(url)
    # file_name = 'pjl_comment.txt'
    # get_top_keywords(file_name)
    draw_wordcloud()

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯繫本站郵箱yoyou2525@163.com刪除。

猜您在找 Python 爬取熱詞並進行分類數據分析-[雲圖制作+數據導入] Python生成詞雲圖 python—帶形狀的詞雲圖 python詞雲圖之WordCloud Python爬蟲b站視頻彈幕並生成詞雲圖分析 Python——爬取電影影評評論 python中實現詞雲圖 python 數據分析--詞雲圖，圖形可視化美國競選辯論 python生成詞雲圖（英文版） python抓取電影<海王>影評詞雲生成