1. 從網上下載一份《天龍八部》的 txt 文檔(保存為 ./天龍八部.txt),以及一份通用的 jieba 停用詞表(保存為 ./停用詞表.txt)
2. 下載一張背景圖片,保存為 ./圖片.jpg
3. 確認字體文件 C:/Windows/Fonts/simsun.ttc 存在(用於詞雲中文渲染)
# -*- coding:utf-8 -*-
"""Extract person-name keywords from the novel 天龍八部 and draw a word cloud.

Pipeline:
1. Segment the raw GB18030 text with jieba, dropping stop words, and save
   the space-joined tokens to 天龍八部分詞.txt (UTF-8).
2. Rank person-name keywords (POS tag 'nr') by TF-IDF weight.
3. Render the top keywords as a word cloud shaped and colored by a
   background image, show it, and save it to 詞雲.jpg.
"""
import jieba
import jieba.analyse
from PIL import Image
import numpy as np
from wordcloud import WordCloud, ImageColorGenerator
import matplotlib.pyplot as plt

# Load the stop-word list once (one word per line).  A set gives O(1)
# membership tests — the original list made every token an O(n) scan.
with open("./停用詞表.txt", encoding="utf-8") as f:
    stopwords = {line.strip() for line in f}


def seg_sentence(sentence):
    """Segment one line of text with jieba, dropping stop words and tabs.

    Returns the surviving tokens joined by single spaces.
    """
    words = jieba.cut(sentence.strip())
    return ' '.join(w for w in words if w not in stopwords and w != '\t')


# Segment the whole novel.  Both files are managed by `with` so the
# handles are closed even on error (the originals leaked them).
# Source is GB18030-encoded; the segmented output is written as UTF-8.
with open("./天龍八部.txt", 'r', encoding='GB18030') as inputs, \
        open("天龍八部分詞.txt", "w", encoding='utf-8') as outputs:
    for line in inputs:
        outputs.write(seg_sentence(line) + '\n')

# Extract the top-20 keywords restricted to person names (POS 'nr'),
# each returned with its TF-IDF weight.
with open("./天龍八部分詞.txt", encoding="utf-8") as f:
    text = f.read()
result = jieba.analyse.extract_tags(text, topK=20, withWeight=True,
                                    allowPOS=('nr',))
print(result)

# result is [('段譽', 0.588...), ('蕭峰', 0.463...), ...] — exactly the
# (word, weight) pairs dict() accepts, giving the {word: weight} mapping
# that WordCloud.generate_from_frequencies expects.
keywords = dict(result)

# The background image serves as both the mask (cloud shape) and the
# color palette for the rendered words.
graph = np.array(Image.open('./圖片.jpg'))
wc = WordCloud(font_path='C:/Windows/Fonts/simsun.ttc',
               background_color="White",
               max_words=15,
               mask=graph)
wc.generate_from_frequencies(keywords)
# The original built an ImageColorGenerator but never applied it;
# recolor() is its intended use — tint words from the background image.
wc.recolor(color_func=ImageColorGenerator(graph))
plt.imshow(wc)
plt.axis("off")
plt.show()
wc.to_file('詞雲.jpg')
