使用 wordcloud 模塊生成詞雲。一般可以生成兩種類型的詞雲:
一、默認圖片生成
# Build a word cloud (default rectangular canvas) from the entertainment-news
# corpus: segment the text with jieba, drop stopwords, count word frequencies
# and render the 1000 most frequent words.
import warnings
warnings.filterwarnings("ignore")

import jieba  # Chinese word segmentation
import numpy  # numerical computing
import codecs  # codecs.open lets you state a file's encoding; text is decoded to unicode on read
import pandas as pd
import matplotlib.pyplot as plt

# "%matplotlib inline" is an IPython magic, not Python syntax. Enable it only
# when running under IPython/Jupyter so the snippet is also valid as a script.
try:
    get_ipython().run_line_magic("matplotlib", "inline")
except NameError:
    pass

import matplotlib
matplotlib.rcParams['figure.figsize'] = (10.0, 5.0)
from wordcloud import WordCloud  # word cloud generator

# Load the entertainment news data and segment it into words.
df = pd.read_csv("./data/entertainment_news.csv", encoding='utf-8')
df = df.dropna()
content = df.content.values.tolist()

# Optional: load a custom dictionary before segmenting, e.g.
# jieba.load_userdict("data/user_dic.txt")
segment = []
for line in content:
    try:
        segs = jieba.lcut(line)
    except Exception:
        # Best effort: show the row that could not be segmented and move on.
        # (Not a bare "except:", so Ctrl-C still interrupts the loop.)
        print(line)
        continue
    # Keep tokens longer than one character and skip bare line breaks.
    segment.extend(seg for seg in segs if len(seg) > 1 and seg != '\r\n')

# Remove stopwords.
words_df = pd.DataFrame({'segment': segment})
stopwords = pd.read_csv(
    "data/stopwords.txt",
    index_col=False,
    quoting=3,  # 3 == csv.QUOTE_NONE: never treat quote characters specially
    sep="\t",
    names=['stopword'],
    encoding='utf-8',
)
words_df = words_df[~words_df.segment.isin(stopwords.stopword)]

# Count word frequencies. size() + reset_index(name=...) yields the same
# "segment"/"計數" columns as the old agg({"計數": numpy.size}) form, whose
# dict-renaming behaviour has been removed from pandas.
words_stat = words_df.groupby('segment').size().reset_index(name="計數")
words_stat = words_stat.sort_values(by=["計數"], ascending=False)

# Render the word cloud from the 1000 most frequent words.
wordcloud = WordCloud(
    font_path="data/simhei.ttf",  # a CJK-capable font is needed for Chinese glyphs
    background_color="white",
    max_font_size=80,
)
word_frequence = {word: count for word, count in words_stat.head(1000).values}
wordcloud = wordcloud.fit_words(word_frequence)
plt.imshow(wordcloud)
plt.show()
效果:
二、生成自定義圖片的詞雲
# Build a word cloud shaped and coloured by a custom image.
# Relies on `matplotlib`, `plt` and `words_stat` defined by the previous snippet.
matplotlib.rcParams['figure.figsize'] = (15.0, 15.0)
from wordcloud import WordCloud, ImageColorGenerator

# scipy.misc.imread has been removed from SciPy; matplotlib's reader is used
# instead and returns the image as a NumPy array.
bimg = plt.imread('image/entertainment.jpeg')

# The image is passed as the mask that shapes the cloud.
wordcloud = WordCloud(
    background_color="white",
    mask=bimg,
    font_path='data/simhei.ttf',
    max_font_size=200,
)
word_frequence = {word: count for word, count in words_stat.head(1000).values}
wordcloud = wordcloud.fit_words(word_frequence)

# Recolour the words with a colour function built from the same image.
bimgColors = ImageColorGenerator(bimg)
plt.axis("off")
plt.imshow(wordcloud.recolor(color_func=bimgColors))
plt.show()
效果: