1 # -*- coding:utf-8 -*-
'''
Scrape the user comments of one movie on Douban Movies.
Target URL: https://movie.douban.com/subject/26630781/comments
Fetching the complete comment list requires logging in first.
'''
8 from selenium import webdriver
9 import time
10 import codecs
11 import jieba
12 import jieba.analyse as analyse
13 from wordcloud import WordCloud
14 from scipy.misc import imread
15 from os import path
16
def get_douban_comments(url, user_name='1111111', password='11111111',
                        start_page=501, start_count=10000,
                        output_file='pjl_comment.txt'):
    """Log in to Douban through Firefox and scrape the comments shown at *url*.

    The function fills in the login form, asks the operator on the console
    for the captcha shown in the browser, then walks the comment pages by
    clicking the "next" link until that fails. The text of every comment is
    appended (one per line, UTF-8) to *output_file*.

    Args:
        url: Comment-list page to start scraping from.
        user_name: Douban account name typed into the login form. The
            default is a placeholder; pass a real account name.
        password: Douban password typed into the login form. The default
            is a placeholder; pass a real password.
        start_page: Number used for the first page in progress messages.
        start_count: Number used for the first comment in progress messages.
        output_file: File the comments are appended to.

    Returns:
        None. Results are written to *output_file* and progress is printed.
    """
    # ``By`` ships with the selenium package this file already depends on;
    # ``find_element(By.X, ...)`` is the locator API that exists across
    # Selenium releases, unlike the ``find_element_by_*`` helpers.
    from selenium.webdriver.common.by import By

    # Console input helper that works on both Python 2 and Python 3.
    try:
        read_line = raw_input
    except NameError:
        read_line = input

    login_url = 'https://accounts.douban.com/login?source=movie'
    comments_list = []  # scraped comment texts, each terminated by '\n'
    page = start_page
    count = start_count

    driver = webdriver.Firefox()
    try:
        driver.get(login_url)

        email_box = driver.find_element(By.ID, 'email')
        email_box.clear()
        email_box.send_keys(user_name)

        password_box = driver.find_element(By.ID, 'password')
        password_box.clear()
        password_box.send_keys(password)

        # The captcha cannot be solved automatically: the operator reads it
        # in the browser window and types it on the console.
        captcha = read_line('請打開瀏覽器輸入驗證碼:')
        driver.find_element(By.ID, 'captcha_field').send_keys(captcha)
        driver.find_element(By.CLASS_NAME, 'btn-submit').click()
        time.sleep(5)  # wait for the redirect that follows a login

        driver.get(url)
        driver.implicitly_wait(3)

        while True:
            try:
                for result in driver.find_elements(By.CLASS_NAME, 'comment'):
                    comment = result.find_element(By.TAG_NAME, 'p').text
                    comments_list.append(comment + u'\n')
                    print(u"查找到第%d個評論" % count)
                    count += 1
                driver.find_element(By.CLASS_NAME, 'next').click()
                print(u'第%d頁查找完畢!' % page)
                page += 1
                time.sleep(4)  # pause between pages
            except Exception as e:
                # Deliberately broad: any failure (typically no "next" link
                # on the last page) ends the crawl and keeps what was
                # collected so far.
                print(e)
                break
    finally:
        # Always close the browser, including when the login steps raise.
        driver.quit()

    with codecs.open(output_file, 'a', encoding='utf-8') as f:
        f.writelines(comments_list)
    print(u"查找到第%d頁,第%d個評論!" % (page, count))
57
# Collect every segmented word and count how often it occurs
def get_all_keywords(file_name):
    """Count every word produced by jieba segmentation of a UTF-8 text file.

    Each line of *file_name* is segmented with ``jieba.cut``. The number of
    distinct words is printed, then one ``rank---word:count次`` line per word.
    The same ``word:count次`` entries are written to ``count_word.txt``
    (overwritten on every call). Entries are listed most frequent first.

    Args:
        file_name: Path of the UTF-8 encoded text file to analyse.

    Returns:
        None. Results are printed and written to ``count_word.txt``.
    """
    # Local import so this block does not depend on the file's import header.
    from collections import Counter

    # One pass over the tokens instead of calling list.count() per unique
    # word, which was quadratic in the number of tokens.
    word_counts = Counter()
    with codecs.open(file_name, 'r', encoding='utf-8') as f:
        for line in f:
            word_counts.update(jieba.cut(line))

    print(u"共有%d個關鍵詞" % len(word_counts))

    report_lines = []
    for rank, (word, occurrences) in enumerate(word_counts.most_common(), 1):
        report_lines.append(u"%s:%d次\n" % (word, occurrences))
        print(u"%d---%s:%d次" % (rank, word, occurrences))

    with codecs.open('count_word.txt', 'w', encoding='utf-8') as f:
        f.writelines(report_lines)
79
def get_top_keywords(file_name, top_k=20):
    """Print the highest-ranked TextRank keywords of a UTF-8 text file.

    The whole file is read as one string and passed to
    ``jieba.analyse.textrank`` with weights and part-of-speech flags enabled.
    One line is printed per keyword: its 1-based rank, the keyword and its
    flag, then its weight.

    Args:
        file_name: Path of the UTF-8 encoded text file to analyse.
        top_k: Maximum number of keywords requested from TextRank
            (defaults to 20, the value that used to be hard-coded).

    Returns:
        None. The ranking is printed only.
    """
    with codecs.open(file_name, 'r', encoding='utf-8') as f:
        texts = f.read()

    ranked = analyse.textrank(texts, topK=top_k, withWeight=True, withFlag=True)
    for position, (tagged_word, weight) in enumerate(ranked, 1):
        # Iterating tagged_word yields the keyword and its part-of-speech
        # flag; each is followed by three spaces, as in the earlier output.
        parts = u"".join(u"%s   " % item for item in tagged_word)
        print(u"%d: %s權重:%s" % (position, parts, weight))
92
# Draw the word cloud
def draw_wordcloud(comment_file='pjl_comment.txt',
                   mask_path="F:/python2.7work/wordcloud/alice_color.png",
                   font_path=None,
                   output_file="pjl_cloud.jpg"):
    """Render a word cloud image from a file of scraped comments.

    The text of *comment_file* is segmented with jieba, the tokens are joined
    with spaces, and a ``WordCloud`` is generated from the result using the
    image at *mask_path* as its mask. The picture is saved to *output_file*.
    Every default reproduces the value that used to be hard-coded.

    Args:
        comment_file: UTF-8 text file holding the comments.
        mask_path: Image file passed to ``WordCloud`` as the mask.
        font_path: Font file used for rendering. When ``None`` the file
            ``simsun.ttc`` next to this script is used.
        output_file: Destination image file.

    Returns:
        None. The image is written to *output_file*.
    """
    with codecs.open(comment_file, encoding='utf-8') as f:
        comment_text = f.read()

    # WordCloud receives one string of space-separated tokens.
    cut_text = " ".join(jieba.cut(comment_text))

    if font_path is None:
        # Default font lives in the same directory as this script.
        font_path = path.join(path.dirname(__file__), 'simsun.ttc')

    # NOTE(review): scipy.misc.imread is no longer available in current SciPy
    # releases - confirm the installed version or switch image readers.
    color_mask = imread(mask_path)

    cloud = WordCloud(
        font_path=font_path,
        background_color='white',
        mask=color_mask,
        max_words=2000,
        max_font_size=40,
    )
    cloud.generate(cut_text).to_file(output_file)
103
104
105
if __name__ == '__main__':
    # Script entry point. The scraping and keyword-ranking steps below are
    # disabled; only the word cloud is drawn, from the existing comment file.
    #
    #     url = 'https://movie.douban.com/subject/26630781/comments?start=10581&limit=20&sort=new_score'
    #     get_douban_comments(url)
    #     file_name = 'pjl_comment.txt'
    #     get_top_keywords(file_name)
    draw_wordcloud()
