Crawling the novel 《三國演義》 with Python, counting word frequencies, and generating a word cloud

Notes:

  1. The regular expression used to extract the chapter body has to match across multiple lines, so dot-matches-newline mode must be enabled with the inline (?s) flag (i.e. re.DOTALL): book_content_re = re.compile(r'(?s)<div.*?id="htmlContent">(.*?)</div>'); a short sketch follows below.

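For reference, a minimal sketch of what the inline (?s) flag changes; the HTML snippet below is made up purely for illustration:

import re

html = '<div id="htmlContent">line one\nline two</div>'

# Without (?s), '.' does not match '\n', so the pattern fails once the body spans several lines.
print(re.findall(r'<div.*?id="htmlContent">(.*?)</div>', html))      # []

# With (?s), which is equivalent to re.DOTALL, '.' also matches '\n'.
print(re.findall(r'(?s)<div.*?id="htmlContent">(.*?)</div>', html))  # ['line one\nline two']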

Source code (practice exercise)

 
import re
import requests
import jieba
import jieba.analyse
import codecs
from collections import Counter
import wordcloud
import matplotlib.pyplot as plt

# the index page of the novel to crawl
url = 'http://www.janpn.com/book/sanguoyanyi2.html'


def get_content(url):
    # fetch the raw bytes first, then decode them
    txt = requests.get(url).content.decode('utf-8')
    # print(txt)

    # the book title; findall() returns a list, so take findall(txt)[0] if it is needed
    book_title = re.compile(r'<h3 class="bookTitle">(.+)</h3>')

    # earlier attempts that did not match reliably:
    # book_chapters_re = re.compile(r'<li><a href="(.+\.html)">([第|回].+)</a></li>')
    # book_chapters_re = re.compile(r'<ul class="panel-body panel-chapterlist"><li><a href="(.+)">(.*)</a></li></ul>')
    # note: non-greedy matching is essential here so each match stops at its own ".html"
    # (see the greedy vs. non-greedy sketch after this function)
    book_chapters_re = re.compile(r'<li><a href="(.*?\.html)".*?>(.+?)</a></li>')
    book_chapters = book_chapters_re.findall(txt)

    # the chapter body spans many lines, so enable dot-matches-newline with (?s)
    book_content_re = re.compile(r'(?s)<div.*?id="htmlContent">(.*?)</div>')

    # patterns for stripping \r\n, &nbsp; and <br /> from the body text
    m3 = re.compile(r'\r\n')
    m4 = re.compile(r'&nbsp;')
    m5 = re.compile(r'<br />')

    print(book_chapters)
    # write with gbk so the later codecs.open(..., 'gbk') calls decode the file cleanly
    with open('三國演義.txt', 'a', encoding='gbk') as f:
        for i in book_chapters:
            print([i[0], i[1]])
            i_url = i[0]
            print("正在下載--->%s" % i[1])
            # fetch each chapter page: raw bytes first, then decode
            content_html = requests.get(i_url).content.decode('utf-8')
            # extract the chapter body
            content = book_content_re.findall(content_html)[0]
            # strip line breaks, &nbsp; entities and <br /> tags
            content = m3.sub('', content)
            content = m4.sub('', content)
            content = m5.sub('', content)
            print(content)
            f.write('\n' + i[1] + '\n')
            f.write(content)

 # =================================================
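# For reference (not part of the original code): why get_content() above needs the
# non-greedy (.*?\.html) rather than the greedy (.+\.html), shown on a toy string:
#     s = '<a href="a.html"></a><a href="b.html"></a>'
#     re.findall(r'href="(.+\.html)"', s)   # -> ['a.html"></a><a href="b.html']  (greedy overshoots)
#     re.findall(r'href="(.*?\.html)"', s)  # -> ['a.html', 'b.html']             (stops at each .html)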
# build the stop-word list
def stopwordlist():
    # readlines(), not readline(): readline() returns a single string and the
    # comprehension would then iterate over its characters
    stopwords = [line.strip() for line in open('../結巴分詞/hit_stopwords.txt', encoding='UTF-8').readlines()]
    return stopwords

# segment a sentence with jieba and remove stop words
def seg_depart(sentence):
    print('正在分詞')
    sentence_depart = jieba.cut(sentence.strip())

    # build the stop-word list
    stopwords = stopwordlist()

    # collect the surviving words into outstr, separated by spaces
    outstr = ''
    for word in sentence_depart:
        if word not in stopwords:
            if word != '\t':
                outstr += word
                outstr += ' '
    return outstr

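# A quick sanity check of seg_depart() (assumes the stop-word file referenced above
# exists; the sample sentence is just the novel's famous opening line):
#     print(seg_depart('話說天下大勢,分久必合,合久必分。'))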
# read the novel from disk and build a word cloud
filepath = '三國演義.txt'


def create_word_cloud(filepath):
    # read the file contents (the crawler wrote the file in gbk)
    content = codecs.open(filepath, 'r', 'gbk').read()

    # remove stop words
    content = seg_depart(content)

    # segment with jieba and join the words with spaces
    wordlist = jieba.cut(content)
    wl = ' '.join(wordlist)

    print(wl)

    # configure the word cloud
    wc = wordcloud.WordCloud(
        # background colour
        background_color='white',
        # maximum number of words to display
        max_words=100,
        # font path (a Chinese font is required for Chinese glyphs)
        font_path=r'C:\Windows\Fonts\msyh.ttc',
        # canvas size
        height=1200,
        width=1600,
        # maximum font size
        max_font_size=300,
        # number of random colour schemes
        random_state=50
    )

    # render the word cloud
    myword = wc.generate(wl)

    # display it
    plt.imshow(myword)
    plt.axis('off')
    plt.show()

 # =================================================
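# Optional (not in the original script): wordcloud can also save the rendered image
# to disk; inside create_word_cloud(), after wc.generate(wl), something like
#     wc.to_file('三國演義_wordcloud.png')   # hypothetical output filename
# would write the same image that plt.show() displays.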
# run the crawler (uncomment to re-download the novel)
# get_content(url)
# generate the word cloud
create_word_cloud(filepath)

# ===================================================

# read the file and count word frequencies
def count_from_file(filepath, top_limit=0):
    with codecs.open(filepath, 'r', 'gbk') as f:
        content = f.read()

        # collapse runs of whitespace into a single space
        content = re.sub(r'\s+', r' ', content)
        content = re.sub(r'\.+', r' ', content)
        # remove stop words
        content = seg_depart(content)

        return count_from_str(content, top_limit)

def count_from_str(content, top_limit=0):
    if top_limit <= 0:
        top_limit = 100
    # extract the article's keywords with jieba's TF-IDF extractor
    tags = jieba.analyse.extract_tags(content, topK=100)
    print("關鍵詞:")
    print(tags)

    # count only the words that appear in the keyword list
    words = jieba.cut(content)
    counter = Counter()
    for word in words:
        if word in tags:
            counter[word] += 1
    return counter.most_common(top_limit)

# ===================================== 
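# For reference (not in the original), the Counter pattern that count_from_str() relies on:
#     from collections import Counter
#     Counter(['曹操', '劉備', '曹操']).most_common(2)   # -> [('曹操', 2), ('劉備', 1)]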
print("打印詞頻統計")

# print the word-frequency statistics
result = count_from_file(filepath)
print(result)


def test(url):
    # enable dot-matches-newline with (?s), since the chapter body contains newlines
    book_content_re = re.compile(r'(?s)<div.*?id="htmlContent">(.*?)</div>')
    content_html = requests.get(url).content.decode('gbk')
    print(content_html)
    content = book_content_re.findall(content_html)
    print(content)

# test("http://www.janpn.com/book/171/171182/35225767.html")
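The (word, count) pairs returned by count_from_file can also be plotted as a bar chart. A minimal sketch using the matplotlib import already in the script; plot_top_words and the Microsoft YaHei font choice are illustrative additions, not part of the original:

def plot_top_words(pairs):
    # pairs is a list of (word, count) tuples as returned by Counter.most_common()
    plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']  # so Chinese labels render
    words = [w for w, _ in pairs]
    counts = [c for _, c in pairs]
    plt.figure(figsize=(10, 6))
    plt.bar(words, counts)
    plt.xticks(rotation=45)
    plt.show()

# plot_top_words(result[:10])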