"""Extract keywords from a Chinese text file and render them as a word cloud.

The script segments ``./santi.txt`` with jieba, drops stop words and
single-character tokens, prints the top keywords according to three
different rankings (TF-IDF, TextRank, raw frequency), then displays a word
cloud and saves it to ``zzz.png``.
"""
from collections import Counter
from typing import Iterable, List

import jieba
import jieba.analyse
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image
from pyquery import PyQuery
from wordcloud import WordCloud

# Read the local document; the context manager closes the handle promptly.
with open('./santi.txt', 'r', encoding='utf-8') as _santi_file:
    santi_text = _santi_file.read()

# Enable parallel segmentation; the argument is the number of worker processes.
jieba.enable_parallel(4)
# Load the external user dictionary.
jieba.load_userdict('./userdict.txt')


def stopwordslist(filepath: str) -> List[str]:
    """Return the stop words stored in *filepath*, one per line.

    Each line is stripped of surrounding whitespace. The file is read as
    UTF-8 and closed before returning.
    """
    with open(filepath, 'r', encoding='utf-8') as stopword_file:
        return [line.strip() for line in stopword_file]


def movestopwords(sentence: Iterable[str]) -> List[str]:
    """Filter stop words and single-character tokens out of *sentence*.

    Args:
        sentence: An iterable of already-segmented tokens.

    Returns:
        The tokens longer than one character that do not appear in
        ``./stop_words.txt``, in their original order.
    """
    # A set gives O(1) membership tests; the original list made the filter
    # O(tokens * stop words).
    stopwords = set(stopwordslist('./stop_words.txt'))
    return [token for token in sentence
            if len(token) > 1 and token not in stopwords]


def main() -> None:
    """Print keyword rankings for the document and render its word cloud."""
    # PyQuery(...).text() strips HTML tags before segmentation.
    words = jieba.cut(PyQuery(santi_text).text())
    word_list = movestopwords(words)  # remove stop words
    words_split = " ".join(word_list)  # join the token list into one string

    print('以下是tf-tdf算法-------------------------------------------------')
    # TF-IDF ranking.
    keywords_tf = jieba.analyse.extract_tags(
        words_split,
        topK=100,
        withWeight=True,
        allowPOS=('ns', 'n', 'vn', 'v'),
    )
    for item in keywords_tf:
        print(item[0], item[1])

    print('以下是textrank算法-------------------------------------------------')
    # TextRank ranking.
    keywords_rank = jieba.analyse.textrank(
        words_split,
        topK=100,
        withWeight=True,
        allowPOS=('ns', 'n', 'vn', 'v'),
    )
    for item in keywords_rank:
        print(item[0], item[1])

    print('以下是純詞頻統計-------------------------------------------------')
    mycount = Counter(word_list)  # raw term frequencies
    # Ordered by count, limited to the 100 most frequent tokens.
    for key, val in mycount.most_common(100):
        print(key, val)

    # NOTE: to shape the cloud with an image, pass e.g.
    # mask=np.array(Image.open("./zhihu.png")) to WordCloud below.
    wc = WordCloud(
        background_color="#000000",  # background colour
        max_words=50,  # maximum number of words drawn
        max_font_size=400,  # largest font size
        min_font_size=10,  # smallest font size
        random_state=42,  # fixed seed so layout and colours are reproducible
        font_path='./SimHei.ttf',  # font file used to render the words
    )

    my_wordcloud = wc.generate(words_split)  # build the cloud from the text
    plt.imshow(my_wordcloud)  # display the word cloud
    plt.axis("off")  # hide both axes
    plt.show()
    wc.to_file('zzz.png')  # save the image to disk


if __name__ == '__main__':
    main()