相关截图
练习代码
# Practice script: word clouds for "The Legend of the Condor Heroes" (射雕英雄传).
#
# Pipeline: load the novel, tag each line with a chapter number, aggregate the
# lines into one text blob per chapter, then draw several word clouds:
# plain text, explicit frequencies, NLTK FreqDist, and a mask-shaped cloud.

import wordcloud
import pandas as pd
import jieba
import matplotlib.pyplot as plt
import nltk
from nltk import FreqDist
from nltk.corpus import brown  # NOTE(review): unused below — kept from the original notes
from imageio import imread  # scipy.misc.imread was removed; imageio is the replacement

# Raw string: '\W' and '\m' are invalid escape sequences in a normal string.
font_path = r'C:\Windows\Fonts\msyh.ttc'

text = 'this is shanghai, 李帅, 郭靖, 成龙, 哀牢山 三十六剑'

# Load the novel; each physical line becomes one row of column 'txt'.
raw = pd.read_table('./金庸-射雕英雄传txt精校版.txt', names=['txt'], encoding='GBK')


def m_head(tmpstr):
    """Return the first character of the line (chapter headers start with '第')."""
    return tmpstr[:1]


def m_mid(tmpstr):
    """Return the index of '回 ' in the line, or -1 if absent (chapter-header marker)."""
    return tmpstr.find("回 ")


# Temporary helper columns used only for chapter detection.
raw['head'] = raw.txt.apply(m_head)
raw['mid'] = raw.txt.apply(m_mid)
raw['len'] = raw.txt.apply(len)

# Chapter tagging: a short line shaped like "第...回 ..." starts a new chapter;
# the appendix title resets the counter to 0 so back matter is grouped with the
# front matter and dropped below.
chapnum = 0
for i in range(len(raw)):
    if raw['head'][i] == "第" and raw['mid'][i] > 0 and raw['len'][i] < 30:
        chapnum += 1
    if chapnum >= 40 and raw['txt'][i] == "附录一:成吉思汗家族":
        chapnum = 0
    raw.loc[i, 'chap'] = chapnum

# Drop the temporary helper columns.
del raw['head']
del raw['mid']
del raw['len']

# Aggregate the lines of each chapter into one string ('sum' concatenates).
# The string form 'sum' avoids the pandas deprecation of passing builtin sum.
rawgrp = raw.groupby('chap')
chapter = rawgrp.agg('sum')
chapter = chapter[chapter.index != 0]  # chapter 0 holds front/appendix matter

t = chapter.txt[1]
print("*" * 100)
print(t)
print("*" * 100)

# --- Word cloud for chapter 1 of the novel ---------------------------------
# Read the stop-word list: sep='aaa' never matches, so each line is one word.
stoplist = list(pd.read_csv('./停用词.txt', names=['w'], sep='aaa',
                            encoding='utf-8', engine='python').w)


def m_cut(intxt):
    """Tokenize with jieba and drop stop words."""
    return [w for w in jieba.cut(intxt) if w not in stoplist]


cloudobj = wordcloud.WordCloud(
    font_path=font_path,
    width=1200,
    height=800,
    mode='RGBA',
    background_color=None,  # transparent background
    stopwords=stoplist,
).generate(' '.join(jieba.lcut(chapter.txt[1])))
plt.imshow(cloudobj)
plt.axis('off')  # hide the axes around the image
plt.show()

# --- Word cloud from explicit term frequencies -----------------------------
txt_freq = {'张三': 100, '李四': 90, '王二麻子': 50}
cloudobj = wordcloud.WordCloud(
    font_path=font_path,
).fit_words(txt_freq)
plt.imshow(cloudobj)
plt.axis("off")
plt.show()

# --- Frequency-based word cloud for chapter 1 ------------------------------
tokens = m_cut(chapter.txt[1])  # tokenized, stop words removed
fdist = FreqDist(tokens)  # dict-like word -> count mapping
print(type(fdist))  # <class 'nltk.probability.FreqDist'>

cloudobj = wordcloud.WordCloud(
    font_path=font_path,
    background_color=None,
    width=1600,
    height=1000,
).fit_words(fdist)
plt.imshow(cloudobj)
plt.axis("off")
plt.show()

# --- Mask-shaped word cloud ------------------------------------------------
# A mask image controls the overall cloud shape: when a mask is given, the
# width/height settings are ignored and words are drawn only in the non-white
# region, so the mask's canvas must be pure white. Font sizes, layout and
# colors are derived from the mask; adjust colors if the result is hard to read.


def m_cut2(intxt):
    """Like m_cut, but also drop single-character tokens."""
    return [w for w in jieba.cut(intxt) if w not in stoplist and len(w) > 1]


cloudobj = wordcloud.WordCloud(
    font_path=font_path,
    mask=imread('射雕背景1.png'),
    mode='RGBA',
    background_color=None,
).generate(' '.join(m_cut2(chapter.txt[1])))
plt.imshow(cloudobj)
plt.axis("off")
plt.show()