jieba分詞以及wordcloud詞雲


1.從網上下載一份 天龍八部的txt文檔以及一份通用的jieba停用詞表

 

2.下載一張背景圖片,保存為 圖片.jpg

 

3.檢查一個字體文件   C:/Windows/Fonts/simsun.ttc

 

# -*- coding:utf-8 -*-
import jieba
import jieba.analyse
from PIL import Image
import  numpy as np
from wordcloud import WordCloud,ImageColorGenerator
import  matplotlib.pyplot as plt
# Chinese word segmentation: strip stop words from 天龍八部.txt and write the
# segmented text to 天龍八部分詞.txt.
# Load the stop-word list once. A set gives O(1) membership tests inside the
# per-token filter, and the context manager guarantees the file is closed
# (the original left the handle open and used an O(n) list lookup).
with open("./停用詞表.txt", encoding="utf-8") as stopword_file:
    stopwords = {line.strip() for line in stopword_file}
def seg_sentence(sentence):
    """Segment *sentence* with jieba, dropping stop words and tab tokens.

    Returns the surviving tokens joined by single spaces.
    """
    kept = []
    for token in jieba.cut(sentence.strip()):
        # Skip anything on the stop-word list, plus literal tab tokens.
        if token in stopwords or token == '\t':
            continue
        kept.append(token)
    return ' '.join(kept)
# Segment the novel line by line and write the result to 天龍八部分詞.txt.
# Context managers replace the manual open/close so both handles are released
# even if segmentation raises mid-file (the original never closed the input).
with open("./天龍八部.txt", 'r', encoding='GB18030') as source, \
        open("天龍八部分詞.txt", "w", encoding='utf-8') as outputs:
    for line in source:
        outputs.write(seg_sentence(line) + '\n')


# Keyword extraction with TF-IDF: return the top 20 person-name tokens
# (POS tag 'nr') together with their TF-IDF weights.
# The context manager closes the input file (the original leaked the handle).
with open("./天龍八部分詞.txt", encoding="utf-8") as seg_file:
    text = seg_file.read()
result = jieba.analyse.extract_tags(text, topK=20, withWeight=True, allowPOS=('nr',))
print(result)

# Convert [('段譽', 0.5881865046044787), ('蕭峰', 0.4631424402591722), ...]
# into a word -> weight dict, the input format WordCloud expects.
# `result` is already a list of (word, weight) pairs, so the dict constructor
# replaces the manual unpacking loop.
keywords = dict(result)
    
# Word-cloud rendering: the JPG is used as a shape mask for the cloud.
image = Image.open('./圖片.jpg')
graph = np.array(image)
wc = WordCloud(font_path='C:/Windows/Fonts/simsun.ttc',
               background_color="White",
               max_words=15,
               mask=graph)
# Build the cloud from the TF-IDF weights computed above.
wc.generate_from_frequencies(keywords)
# Recolor the cloud to follow the mask image's palette. The original created
# an ImageColorGenerator but never applied it, so the generator had no effect.
wc.recolor(color_func=ImageColorGenerator(graph))
plt.imshow(wc)
plt.axis("off")
plt.show()
wc.to_file('詞雲.jpg')

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM