Scraping the novel Romance of the Three Kingdoms (《三國演義》) with Python, counting word frequencies, and generating a word cloud
Notes:
- The regular expression that extracts the novel body has to match across multiple lines, so it needs the inline `(?s)` flag (re.DOTALL), which makes `.` match newline characters as well. A short sketch of the difference follows below.
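A minimal sketch of what `(?s)` changes. The HTML snippet here is made up for illustration; the real chapter pages wrap the body in a `<div ... id="htmlContent">` container:

```python
import re

html = '<div id="htmlContent">First paragraph.\nSecond paragraph.</div>'

# Without (?s): '.' does not match '\n', so the body spanning two lines is missed
print(re.findall(r'<div id="htmlContent">(.*?)</div>', html))
# -> []

# With (?s) (re.DOTALL): '.' also matches '\n', so the whole body is captured
print(re.findall(r'(?s)<div id="htmlContent">(.*?)</div>', html))
# -> ['First paragraph.\nSecond paragraph.']
```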
Source code practice
```python
import re
import requests
import jieba
import jieba.analyse
import codecs
from collections import Counter
import wordcloud
import matplotlib.pyplot as plt

# URL of the novel's chapter index page
url = 'http://www.janpn.com/book/sanguoyanyi2.html'


def get_content(url):
    # Fetch the raw bytes first, then decode them
    txt = requests.get(url).content.decode('utf-8')
    # print(txt)
    book_title = re.compile(r'<h3 class="bookTitle">(.+)</h3>')
    # findall() returns a list, so take element [0] for the title
    # print(book_title.findall(txt))
    # book_chapters_re = re.compile(r'<li><a href="(.+\.html)">([第|回].+)</a></li>')
    # book_chapters_re = re.compile(r'<ul class="panel-body panel-chapterlist"><li><a href="(.+)">(.*)</a></li></ul>')
    # Note: the non-greedy quantifier is needed so the match stops at the ".html" ending
    book_chapters_re = re.compile(r'<li><a href="(.*?\.html)".*?>(.+?)</a></li>')
    book_chapters = book_chapters_re.findall(txt)
    # (?s) = re.DOTALL, because the chapter body spans many paragraphs and lines
    book_content_re = re.compile(r'(?s)<div.*?id="htmlContent">(.*?)</div>')
    # Patterns for cleaning the body text: line breaks, spaces, and <br /> tags
    m3 = re.compile(r'\r\n')
    m4 = re.compile(r' ')
    m5 = re.compile(r'<br />')
    print(book_chapters)
    # Append all chapters into one text file (gbk, to match the gbk reads below)
    with open('三國演義.txt', 'a', encoding='gbk') as f:
        for i in book_chapters:
            print([i[0], i[1]])
            print(i[0])
            i_url = i[0]
            print("Downloading ---> %s" % i[1])
            # Fetch each chapter page by its URL: raw bytes first, then decode
            content_html = requests.get(i_url).content.decode('utf-8')
            # Extract the chapter body
            content = book_content_re.findall(content_html)[0]
            print(content)
            content = m3.sub('', content)
            content = m4.sub('', content)
            content = m5.sub('', content)
            print(content)
            f.write('\n' + i[1] + '\n')
            f.write(content)


# =================================================
# Build the stop word list
def stopwordlist():
    # readlines(), not readline(): one stop word per line
    stopwords = [line.strip() for line in open('../結巴分詞/hit_stopwords.txt', encoding='UTF-8').readlines()]
    return stopwords


# Segment a sentence with jieba and remove stop words
def seg_depart(sentence):
    print('Segmenting...')
    sentence_depart = jieba.cut(sentence.strip())
    # Load the stop word list
    stopwords = stopwordlist()
    # Accumulate the result in outstr
    outstr = ''
    # Drop stop words
    for word in sentence_depart:
        if word not in stopwords:
            if word != '\t':
                outstr += word
                outstr += ' '
    return outstr


# Read the file and generate the word cloud
filepath = '三國演義.txt'


def create_word_cloud(filepath):
    # Read the file content
    content = codecs.open(filepath, 'r', 'gbk').read()
    # Remove stop words
    content = seg_depart(content)
    # Segment with jieba
    wordlist = jieba.cut(content)
    wl = ' '.join(wordlist)
    print(wl)
    # Configure the word cloud
    wc = wordcloud.WordCloud(
        # Background colour
        background_color='white',
        # Maximum number of words to display
        max_words=100,
        # Font path (a Chinese font is required to render Chinese words)
        font_path=r'C:\Windows\Fonts\msyh.ttc',
        # height=1200, width=1600,
        # Maximum font size
        max_font_size=300,
        # Number of random colour schemes
        random_state=50
    )
    # Generate the word cloud
    myword = wc.generate(wl)
    # Display it
    plt.imshow(myword)
    plt.axis('off')
    plt.show()


# =================================================
# Run the crawler
# get_content(url)
# Generate the word cloud
create_word_cloud(filepath)


# ===================================================
# Read the file and count word frequencies
def count_from_file(filepath, top_limit=0):
    with codecs.open(filepath, 'r', 'gbk') as f:
        content = f.read()
        # Collapse runs of whitespace into a single space
        content = re.sub(r'\s+', r' ', content)
        content = re.sub(r'\.+', r' ', content)
        # Remove stop words
        content = seg_depart(content)
        return count_from_str(content, top_limit)


def count_from_str(content, top_limit=0):
    if top_limit <= 0:
        top_limit = 100
    # Extract the article's keywords
    tags = jieba.analyse.extract_tags(content, topK=100)
    print("Keywords:")
    print(tags)
    words = jieba.cut(content)
    counter = Counter()
    for word in words:
        if word in tags:
            counter[word] += 1
    return counter.most_common(top_limit)


# =====================================
print("Word frequency statistics")
# Print the word frequency statistics
result = count_from_file(filepath)
print(result)


def test(url):
    # (?s) again: the body contains newlines, so '.' must match them too
    book_content_re = re.compile(r'(?s)<div.*?id="htmlContent">(.*?)</div>')
    content_html = requests.get(url).content.decode('gbk')
    print(content_html)
    content = book_content_re.findall(content_html)
    print(content)


# test("http://www.janpn.com/book/171/171182/35225767.html")
```
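As a side note, `count_from_str()` above combines `jieba.analyse.extract_tags` (TF-IDF keyword extraction) with `collections.Counter`. A tiny standalone sketch of that interaction, using a made-up sentence instead of the whole novel:

```python
import jieba
import jieba.analyse
from collections import Counter

text = '曹操率軍南下,劉備與孫權聯合,周瑜在赤壁大破曹操。'

# extract_tags ranks words by TF-IDF; topK limits how many keywords come back
tags = jieba.analyse.extract_tags(text, topK=5)
print(tags)

# Count only the words that made it into the keyword list,
# which is what count_from_str() does over the full text
counter = Counter(w for w in jieba.cut(text) if w in tags)
print(counter.most_common(5))
```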




