Scraping the novel Romance of the Three Kingdoms (《三國演義》) with Python, counting word frequencies, and generating a word cloud
Notes:
- The regular expression that extracts the novel body has to match across multiple lines, so it needs the inline `(?s)` flag (re.DOTALL), which makes `.` match newline characters as well. A short sketch of the difference follows below.
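A minimal sketch of what `(?s)` changes. The HTML snippet here is made up for illustration; the real chapter pages wrap the body in a `<div ... id="htmlContent">` container:

```python
import re

html = '<div id="htmlContent">First paragraph.\nSecond paragraph.</div>'

# Without (?s): '.' does not match '\n', so the body spanning two lines is missed
print(re.findall(r'<div id="htmlContent">(.*?)</div>', html))
# -> []

# With (?s) (re.DOTALL): '.' also matches '\n', so the whole body is captured
print(re.findall(r'(?s)<div id="htmlContent">(.*?)</div>', html))
# -> ['First paragraph.\nSecond paragraph.']
```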
Source code practice
```python
import re
import requests
import jieba
import jieba.analyse
import codecs
from collections import Counter
import wordcloud
import matplotlib.pyplot as plt

# URL of the novel's chapter index page
url = 'http://www.janpn.com/book/sanguoyanyi2.html'


def get_content(url):
    # Fetch the raw bytes first, then decode them
    txt = requests.get(url).content.decode('utf-8')
    # print(txt)
    book_title = re.compile(r'<h3 class="bookTitle">(.+)</h3>')
    # findall() returns a list, so take element [0] for the title
    # print(book_title.findall(txt))
    # book_chapters_re = re.compile(r'<li><a href="(.+\.html)">([第|回].+)</a></li>')
    # book_chapters_re = re.compile(r'<ul class="panel-body panel-chapterlist"><li><a href="(.+)">(.*)</a></li></ul>')
    # Note: the non-greedy quantifier is needed so the match stops at the ".html" ending
    book_chapters_re = re.compile(r'<li><a href="(.*?\.html)".*?>(.+?)</a></li>')
    book_chapters = book_chapters_re.findall(txt)
    # (?s) = re.DOTALL, because the chapter body spans many paragraphs and lines
    book_content_re = re.compile(r'(?s)<div.*?id="htmlContent">(.*?)</div>')
    # Patterns for cleaning the body text: line breaks, spaces, and <br /> tags
    m3 = re.compile(r'\r\n')
    m4 = re.compile(r' ')
    m5 = re.compile(r'<br />')
    print(book_chapters)
    # Append all chapters into one text file (gbk, to match the gbk reads below)
    with open('三國演義.txt', 'a', encoding='gbk') as f:
        for i in book_chapters:
            print([i[0], i[1]])
            print(i[0])
            i_url = i[0]
            print("Downloading ---> %s" % i[1])
            # Fetch each chapter page by its URL: raw bytes first, then decode
            content_html = requests.get(i_url).content.decode('utf-8')
            # Extract the chapter body
            content = book_content_re.findall(content_html)[0]
            print(content)
            content = m3.sub('', content)
            content = m4.sub('', content)
            content = m5.sub('', content)
            print(content)
            f.write('\n' + i[1] + '\n')
            f.write(content)


# =================================================
# Build the stop word list
def stopwordlist():
    # readlines(), not readline(): one stop word per line
    stopwords = [line.strip() for line in open('../結巴分詞/hit_stopwords.txt', encoding='UTF-8').readlines()]
    return stopwords


# Segment a sentence with jieba and remove stop words
def seg_depart(sentence):
    print('Segmenting...')
    sentence_depart = jieba.cut(sentence.strip())
    # Load the stop word list
    stopwords = stopwordlist()
    # Accumulate the result in outstr
    outstr = ''
    # Drop stop words
    for word in sentence_depart:
        if word not in stopwords:
            if word != '\t':
                outstr += word
                outstr += ' '
    return outstr


# Read the file and generate the word cloud
filepath = '三國演義.txt'


def create_word_cloud(filepath):
    # Read the file content
    content = codecs.open(filepath, 'r', 'gbk').read()
    # Remove stop words
    content = seg_depart(content)
    # Segment with jieba
    wordlist = jieba.cut(content)
    wl = ' '.join(wordlist)
    print(wl)
    # Configure the word cloud
    wc = wordcloud.WordCloud(
        # Background colour
        background_color='white',
        # Maximum number of words to display
        max_words=100,
        # Font path (a Chinese font is required to render Chinese words)
        font_path=r'C:\Windows\Fonts\msyh.ttc',
        # height=1200, width=1600,
        # Maximum font size
        max_font_size=300,
        # Number of random colour schemes
        random_state=50
    )
    # Generate the word cloud
    myword = wc.generate(wl)
    # Display it
    plt.imshow(myword)
    plt.axis('off')
    plt.show()


# =================================================
# Run the crawler
# get_content(url)
# Generate the word cloud
create_word_cloud(filepath)


# ===================================================
# Read the file and count word frequencies
def count_from_file(filepath, top_limit=0):
    with codecs.open(filepath, 'r', 'gbk') as f:
        content = f.read()
        # Collapse runs of whitespace into a single space
        content = re.sub(r'\s+', r' ', content)
        content = re.sub(r'\.+', r' ', content)
        # Remove stop words
        content = seg_depart(content)
        return count_from_str(content, top_limit)


def count_from_str(content, top_limit=0):
    if top_limit <= 0:
        top_limit = 100
    # Extract the article's keywords
    tags = jieba.analyse.extract_tags(content, topK=100)
    print("Keywords:")
    print(tags)
    words = jieba.cut(content)
    counter = Counter()
    for word in words:
        if word in tags:
            counter[word] += 1
    return counter.most_common(top_limit)


# =====================================
print("Word frequency statistics")
# Print the word frequency statistics
result = count_from_file(filepath)
print(result)


def test(url):
    # (?s) again: the body contains newlines, so '.' must match them too
    book_content_re = re.compile(r'(?s)<div.*?id="htmlContent">(.*?)</div>')
    content_html = requests.get(url).content.decode('gbk')
    print(content_html)
    content = book_content_re.findall(content_html)
    print(content)


# test("http://www.janpn.com/book/171/171182/35225767.html")
```
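As a side note, `count_from_str()` above combines `jieba.analyse.extract_tags` (TF-IDF keyword extraction) with `collections.Counter`. A tiny standalone sketch of that interaction, using a made-up sentence instead of the whole novel:

```python
import jieba
import jieba.analyse
from collections import Counter

text = '曹操率軍南下,劉備與孫權聯合,周瑜在赤壁大破曹操。'

# extract_tags ranks words by TF-IDF; topK limits how many keywords come back
tags = jieba.analyse.extract_tags(text, topK=5)
print(tags)

# Count only the words that made it into the keyword list,
# which is what count_from_str() does over the full text
counter = Counter(w for w in jieba.cut(text) if w in tags)
print(counter.most_common(5))
```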




