【Python】文本詞頻統計


 

哈姆雷特英文

 

 https://python123.io/resources/pye/hamlet.txt

三國演義中文

https://python123.io/resources/pye/threekingdoms.txt

 

 

 哈姆雷特英文詞頻分析

def getText(path="hamlet.txt"):
    """Read a text file and return it lower-cased with punctuation blanked out.

    Args:
        path: File to read. Defaults to "hamlet.txt" in the current
            working directory, which is what the original script used.

    Returns:
        The file contents in lower case, with each character of the
        punctuation set below replaced by a single space so that a later
        ``str.split()`` yields bare words.
    """
    # Characters treated as separators. The set is kept exactly as the
    # original script defined it (note it contains U+2018 and does not
    # contain the ASCII apostrophe or backtick).
    separators = '!"#$%&()*+,-./;:<=>?@[\\]^‘_{|}~'
    # The context manager closes the file even if read() raises.
    with open(path, "r") as f:
        txt = f.read()
    # One translate() pass is equivalent to replacing each separator with a
    # space in turn, because a space is not itself in the separator set.
    return txt.lower().translate(str.maketrans(dict.fromkeys(separators, " ")))
# Load the normalised text and split it on whitespace into a word list.
hamletTxt = getText()
words = hamletTxt.split()
# Tally how many times each word occurs.
counts = {}
for word in words:
    # get() supplies 0 the first time a word is seen.
    counts[word] = counts.get(word, 0) + 1
# Rank (word, count) pairs by count, highest first. The sort is stable, so
# words with equal counts keep their first-seen order.
items = sorted(counts.items(), key=lambda x: x[1], reverse=True)
# Print the 10 most frequent words. Slicing instead of indexing with
# range(10) avoids an IndexError when there are fewer than 10 distinct words.
for word, count in items[:10]:
    print("{0:<10}{1:>5}".format(word, count))

 

 

 三國演義人物出場次數

import jieba#引入jieba分詞庫
# Read the whole novel; the context manager guarantees the file is closed.
with open("threekingdoms.txt", "r", encoding="utf-8") as f:
    txt = f.read()
# Segment the text into a list of words with jieba.
words = jieba.lcut(txt)
# Count every segmented word, skipping single-character tokens.
counts = {}
for word in words:
    if len(word) == 1:
        continue
    counts[word] = counts.get(word, 0) + 1
# Rank (word, count) pairs by count, highest first (stable for ties).
items = sorted(counts.items(), key=lambda x: x[1], reverse=True)
# Print the top 15 words. Slicing instead of indexing with range(15)
# avoids an IndexError when fewer than 15 distinct words were counted.
for word, count in items[:15]:
    print("{0:<10}{1:<5}".format(word, count))

結果:

 

 上面的結果中有不是人物的詞,需要改進程式:排除非人名的詞,並把同一人物的不同稱呼合併計數。

import jieba
# Read the whole novel; the context manager guarantees the file is closed.
with open("threekingdoms.txt", "r", encoding="utf-8") as f:
    txt = f.read()
# Frequent words that are not character names; they are removed from the
# tally before ranking. Extend this set as more non-names show up.
# NOTE(review): these literals are in traditional characters - confirm they
# match the script used in threekingdoms.txt, otherwise nothing is excluded.
excludes = {
    "將軍", "卻說", "荊州", "二人", "不可", "不能", "如此", "主公",
    "軍士", "商議", "如何", "左右", "軍馬", "引兵", "次日", "大喜",
    "天下", "東吳", "於是", "今日", "不敢", "魏兵", "陛下", "一人",
    "都督", "人馬", "不知",
}
# Alternative names (and "name + 曰" tokens) mapped to the single name under
# which that character is counted, so one person is not split across entries.
aliases = {
    "諸葛亮": "孔明",
    "孔明曰": "孔明",
    "關公": "關羽",
    "雲長": "關羽",
    "玄德": "劉備",
    "玄德曰": "劉備",
    "孟德": "曹操",
    "丞相": "曹操",
}
# Segment the text into a list of words with jieba.
words = jieba.lcut(txt)
counts = {}
for word in words:
    # Single-character tokens are skipped.
    if len(word) == 1:
        continue
    # Fold aliases onto their canonical name; other words count as themselves.
    rword = aliases.get(word, word)
    counts[rword] = counts.get(rword, 0) + 1
# Drop the excluded words. pop() with a default tolerates an excluded word
# that never occurred, where `del counts[word]` would raise KeyError.
for word in excludes:
    counts.pop(word, None)
# Rank (word, count) pairs by count, highest first (stable for ties).
items = sorted(counts.items(), key=lambda x: x[1], reverse=True)
# Print the top 10 entries. Slicing instead of indexing with range(10)
# avoids an IndexError when fewer than 10 entries remain.
for word, count in items[:10]:
    print("{0:<10}{1:<5}".format(word, count))

結果

 

可依此方法持續擴充排除詞庫與人名關聯,不斷優化結果……

 

 

 

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯繫本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM