Python 繪製詞雲圖


1. 先下載並安裝nltk包,準備一張簡單的圖片存入代碼所在文件目錄,並搜集英文停用詞表

# Fetch NLTK data; the tokenizer (word_tokenize) and the stop-word corpus
# (stopwords.words) used later in this file depend on it being installed.
# NOTE(review): called without arguments — presumably this opens NLTK's
# interactive downloader; confirm which packages are actually required.
import nltk
nltk.download()

2. 繪製詞雲圖

import re
import numpy as np
import pandas as pd
#import matplotlib
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from PIL import Image
from wordcloud import WordCloud
from sklearn.datasets import fetch_20newsgroups
#from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter, defaultdict

def word_cut(contents, cut=','):
    """Lower-case each text and split it into non-empty tokens.

    Args:
        contents: Iterable of strings to tokenise.
        cut: Regular-expression pattern handed to ``re.split`` as the
            delimiter (defaults to a single comma).

    Returns:
        A list holding one token list per input string; empty pieces
        produced by the split are discarded.
    """
    return [
        [token for token in re.split(cut, text.lower()) if token]
        for text in contents
    ]
    
def word_count(contents):
    """Count how often each word occurs across all tokenised documents.

    Args:
        contents: Iterable of token sequences (one per document).

    Returns:
        A ``defaultdict`` mapping each word to its total number of
        occurrences; looking up an unseen word yields 0.
    """
    # Summing everything with Counter(sum(contents, [])) works too but is slow.
    totals = defaultdict(lambda: 0)
    for tokens in contents:
        # Tally one document at a time, then fold it into the running totals.
        for token, occurrences in Counter(tokens).items():
            totals[token] += occurrences
    return totals

def stopwords_filter(contents, stopwords):
    """Remove stop words from tokenised documents and rank the remaining words.

    Args:
        contents: Iterable of token sequences (one per document).
        stopwords: Collection of tokens to discard. Note that inside this
            function the parameter shadows the module-level ``stopwords``
            import. A list or tuple is converted to a ``frozenset`` once so
            that each membership test is O(1) instead of a linear scan.

    Returns:
        A ``(contents_clean, words_count)`` tuple where ``contents_clean`` is
        a list of token lists with the stop words removed, and
        ``words_count`` is a ``pandas.DataFrame`` with columns ``word`` and
        ``count`` sorted by descending count (ties keep first-seen order).
    """
    if isinstance(stopwords, (list, tuple)):
        stopwords = frozenset(stopwords)

    contents_clean = []
    frequencies = {}
    for line in contents:
        kept = [word for word in line if word not in stopwords]
        contents_clean.append(kept)
        for word in kept:
            frequencies[word] = frequencies.get(word, 0) + 1

    # sorted() is stable even with reverse=True, so equally frequent words
    # stay in the order in which they were first encountered.
    ranked = sorted(frequencies.items(), key=lambda item: item[1], reverse=True)
    words_count = pd.DataFrame(ranked, columns=['word', 'count'])
    return contents_clean, words_count

# Load the data from external files.
# (Disabled: the snippet below is kept as an inert, unassigned string literal.)
'''
df_news = pd.read_table('val.txt', names=['category','theme','URL','content'], encoding='utf-8')
stopwords = pd.read_csv("stopwords.txt", index_col = False, sep="\t", 
                        quoting=3, names=['stopword'], encoding='utf-8')
contents = df_news.content.values.tolist()
stopwords = stopwords.stopword.values.tolist()'''

# Custom tokenisation with word_cut.
# (Disabled: candidate delimiter patterns kept as an inert string literal.)
# NOTE(review): this is a non-raw string containing sequences such as \[ and
# \^ that are not recognised escapes; newer Python versions warn about them.
'''
#[ ,.\n\t--\':;?!/+<>@]
#[ ,.\n\t=--\'`_\[\]:;?!^/|+<>{}@~\\]
#contents = word_cut(contents=news.data, cut='[ ,.\n\t-\`_\[\]:;?!\^/|+<>{}@~]')
'''
# Arrange the data into the input format expected by a model.
# (Disabled: the CountVectorizer example is kept as an inert string literal;
# its inline note says the fitted vectorizer must not be applied directly to
# the test set.)
'''
#vec = CountVectorizer()
#X_train = vec.fit_transform(X_train)  #不可直接將vec用在測試集上
#vectorizer_test = CountVectorizer(vocabulary=vec.vocabulary_)
#X_test = vectorizer_test.transform(X_test)
'''

# Dump the ranked word counts so stop-word candidates can be picked from them.
# (Disabled: the snippet below is kept as an inert, unassigned string literal.)
'''
word_count_dict = word_count(contents)
temp = list(word_count_dict.items())
temp.sort(key=lambda x:-x[1])
df = pd.DataFrame(temp, columns=['word','count'])
df.to_csv(r'D:\PycharmProjects\zsyb\stop_words.csv')
'''

# Do the same work with library calls instead of the hand-written helpers.
news = fetch_20newsgroups(subset='all')
# Original author's note: the custom word_cut splitter is several times faster
# than word_tokenize; punctuation could also be dropped here by adding an
# `if ... not in ...` filter to the comprehension.
contents = [word_tokenize(content.lower()) for content in news.data]  # sent_tokenize(content)
# Single-character punctuation tokens to discard. The earlier literal relied
# on invalid escape sequences (\`, \[, \], \^), which left a backslash in the
# set and trigger a SyntaxWarning on current Python; the backslash is now
# listed explicitly so the resulting set is unchanged.
punctuations = set(',.\n\t-`_()[]:;?!$#%&*=^/|+<>{}@~\\')
digits = {str(i) for i in range(50)}
others = {'--', "''", '``', "'", '...'}
# Extra stop-word lists downloaded from the web can be placed under
# nltk_data\corpora\stopwords. Low-frequency words should be filtered
# separately rather than added to the stop words.
stopWords = set(stopwords.words('english')) | punctuations | digits | others
contents_clean, words_count = stopwords_filter(contents, stopWords)
#df.groupby(by=['word']).agg({"count": np.size})

# Draw the word cloud.
fontpath = 'simhei.ttf'
aimask = np.array(Image.open(r"D:\PycharmProjects\zsyb\pig.png"))

wc = WordCloud(
    font_path=fontpath,        # font used to render the words
    background_color="white",  # background colour
    max_words=1000,            # maximum number of words shown
    max_font_size=100,         # largest font size
    min_font_size=10,          # smallest font size
    random_state=42,           # random seed
    collocations=False,        # original note: avoid repeated words
    mask=aimask,               # shape mask
    # Image width/height; the original note says these only take effect
    # together with the plt.figure(dpi=...) scaling below.
    # NOTE(review): the wordcloud docs state width/height are ignored when a
    # mask is supplied — confirm.
    width=1200,
    height=800,
    margin=2,                  # spacing between words
)
# Word -> count mapping built from the first 100 rows of words_count.
word_frequence = {word: count for word, count in words_count.head(100).values}
word_cloud = wc.fit_words(word_frequence)

plt.figure(dpi=100)  # change dpi to enlarge or shrink the figure
plt.subplot(121)
plt.imshow(aimask)
#plt.axis("off")  # hide the axes
plt.subplot(122)
plt.imshow(word_cloud)
#plt.axis("off")  # hide the axes

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯繫本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM