Python 可視化:用 jieba 分詞 + TF-IDF + wordcloud 繪製詞雲圖


 

 

我的代碼:

 

# -*- coding: utf-8 -*-
from pandas import read_csv
import numpy as np
# NOTE(review): sklearn.datasets.base was made private (sklearn.datasets._base)
# in scikit-learn 0.22 and removed in 0.24; modern code should import Bunch
# from sklearn.utils instead.
from sklearn.datasets.base import Bunch
import pickle  # used to persist/load the Bunch objects below
from sklearn.feature_extraction.text import TfidfVectorizer
import jieba  # Chinese word segmentation
import operator  # for sorting (unused in the visible code)
from sklearn import metrics
# NOTE(review): sklearn.externals.joblib was removed in scikit-learn 0.23;
# on modern versions `import joblib` directly. (joblib is unused below.)
from sklearn.externals import joblib
import xlwt  # legacy .xls spreadsheet writer
# word-cloud rendering and plotting
import wordcloud
import matplotlib.pyplot as plt
# NOTE(review): scipy.misc.imread was removed in SciPy 1.2; use
# imageio.imread or matplotlib.pyplot.imread on modern installs.
from scipy.misc import imread

        
'''讀取停用詞'''
def _readfile(path):
    with open(path, "rb") as fp:
        content = fp.read()
    return content


''' 讀取bunch對象'''


def _readbunchobj(path):
    with open(path, "rb") as file_obj:
        bunch = pickle.load(file_obj)
    return bunch


'''寫入bunch對象'''


def _writebunchobj(path, bunchobj):
    with open(path, "wb") as file_obj:
        pickle.dump(bunchobj, file_obj)

def buildtestbunch(bunch_path, art_test):
    """Segment every raw text in *art_test* with jieba and pickle the result.

    Each document is coerced to ``str``, stripped of CR/LF pairs and spaces,
    cut into words, filtered down to tokens longer than one character, and
    stored in ``bunch.contents`` as a comma-separated string.  The filled
    Bunch is then pickled to *bunch_path*.

    Fixes vs. the original: tokens are joined with ``",".join`` instead of
    repeated ``save2 = save2 + "," + tok`` concatenation, which was quadratic
    and left a spurious leading comma on every document; persistence goes
    through the shared ``_writebunchobj`` helper instead of duplicating it.
    """
    bunch = Bunch(contents=[])
    for raw in art_test:
        text = str(raw).replace("\r\n", "").replace(" ", "")
        # Keep only multi-character tokens (drops punctuation/single chars).
        tokens = [tok for tok in jieba.cut(text)
                  if len(tok) > 1 and tok != '\r\n']
        bunch.contents.append(",".join(tokens))
    _writebunchobj(bunch_path, bunch)
    print("構建測試數據文本對象結束!!!")


def vector_space(stopword_path, bunch_path, space_path):
    """Build and persist a TF-IDF vector space from a segmented bunch.

    Loads the stop-word list from *stopword_path* and the pickled, already
    segmented bunch from *bunch_path*, fits a TfidfVectorizer on
    ``bunch.contents`` and pickles a new Bunch holding the labels, the
    weight matrix ``tdm`` and the fitted vocabulary to *space_path*.

    ``tdm`` is a sparse 2-D matrix: ``tdm[i][j]`` is the TF-IDF weight of
    term *j* (its dictionary index) in document *i*.
    """
    stop_words = _readfile(stopword_path).splitlines()
    bunch = _readbunchobj(bunch_path)
    tfidfspace = Bunch(label=bunch.label, tdm=[], vocabulary={})

    vectorizer = TfidfVectorizer(
        stop_words=stop_words,
        sublinear_tf=True,
        max_df=0.5,
        min_df=0.0001,
        use_idf=True,
        max_features=15000,
    )
    # Fit on the corpus, then keep both the weight matrix and the dictionary.
    tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)
    tfidfspace.vocabulary = vectorizer.vocabulary_
    _writebunchobj(space_path, tfidfspace)
    print("if-idf詞向量空間實例創建成功!!!")


def testvector_space(stopword_path, bunch_path, space_path, train_tfidf_path):
    """Vectorize the test documents against the training vocabulary.

    The vocabulary is taken from the pickled training TF-IDF space at
    *train_tfidf_path*; the test bunch at *bunch_path* is then transformed
    with a vectorizer constrained to that vocabulary and the resulting
    space is pickled to *space_path*.

    NOTE(review): ``fit_transform`` recomputes IDF statistics from the test
    corpus itself — only the vocabulary is reused from training.  Confirm
    this is intended rather than transforming with the trained model.
    """
    stop_words = _readfile(stopword_path).splitlines()
    bunch = _readbunchobj(bunch_path)
    trainbunch = _readbunchobj(train_tfidf_path)  # training TF-IDF space
    tfidfspace = Bunch(tdm=[], vocabulary=trainbunch.vocabulary)

    vectorizer = TfidfVectorizer(
        stop_words=stop_words,
        sublinear_tf=True,
        max_df=0.7,
        min_df=0.001,
        vocabulary=trainbunch.vocabulary,
    )
    tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)
    _writebunchobj(space_path, tfidfspace)
    print("if-idf詞向量空間實例創建成功!!!")

if __name__=="__main__":
    # Pipeline: read the CSV corpus, jieba-segment every document, build a
    # TF-IDF space over the whole set, render a word cloud of the vocabulary,
    # and export per-document top terms plus the term->id map to Excel.

    Sdata = []  # raw CSV rows
    art = []    # just the text column of each row
    '''============================先導入數據=================================='''
    # NOTE(review): hard-coded absolute Windows paths — adjust before reuse.
    file_test = 'F:/goverment/text analyse/type_in.csv'

    dataset = read_csv(file_test)
    Sdata = dataset.values[:, :]
    Sdata=Sdata.tolist()
    for line in Sdata:
        art.append(line[1])#line[1] is the document text (column 0 is used below as an id)
    print(len(Sdata))
    
    '''==========================================================tf-idf對Bar進行文本特征提取============================================================================'''
    # Paths: pickled segmented bunch, the TF-IDF space, and the stop-word list.
    test_bunch_path = "F:/goverment/text analyse/trainbunch.bat"
    test_space_path = "F:/goverment/text analyse/traintfdifspace.dat"
    stopword_path = "F:/goverment/text analyse/hlt_stop_words.txt"

    '''============================================================tf-idf對Sart進行文本特征提取=============================================================================='''

    buildtestbunch(test_bunch_path, art)

    # NOTE(review): test_space_path is passed as BOTH the output space and the
    # "training" vocabulary source; testvector_space reads it before writing,
    # so a previous run must already have produced this file — confirm.
    testvector_space(stopword_path, test_bunch_path, test_space_path, test_space_path)

    test_set = _readbunchobj(test_space_path)

    '''測試數據'''

    
    # Build the reverse map so a term id can be looked back up to its text.
    txtcut=[] #every vocabulary term (feeds the word cloud)
    dic={}    #term id -> term (inverse of test_set.vocabulary)
    for i in test_set.vocabulary.keys():
        txtcut.append(i)
        dic[test_set.vocabulary[i]]=i
        

    #print(dic)
    
    #print(test_set.tdm)
    #print(test_set.tdm[0])
    #print(dir(test_set))
    #print(test_set.vocabulary)
    #print(dir(test_set.tdm))
    
    #print(Sdata)
    
    #print(nonzero[1])
  
    '''final里放的是不超過15的詞'''
    #print(Sdata)
    # For every document collect at most 15 non-zero-weight terms, each
    # formatted as "term  weight%", with the document id as the first cell.
    final=[]
    for k in range(len(Sdata)):#iterate over every document
        nonzero=test_set.tdm[k].nonzero()
        ls=[]
        ls.append(Sdata[k][0])
        num=0
        for i in range(len(nonzero[1])):
            num=num+1
            b=test_set.tdm[k, nonzero[1][i]]*100 #TF-IDF weight (as a %) of the i-th non-zero term in document k
            a= dic[nonzero[1][i]] +"  "+str(round(b,2))+"%"
            ls.append(a)
            if num==15:
                break
        final.append(ls)
    
    '''畫詞雲圖'''
    # Word cloud over the whole vocabulary, shaped by the water3.png mask and
    # recoloured with that image's own colours; saved at 400 dpi.
    fig = plt.figure(figsize = (15,15))
    cloud = wordcloud.WordCloud(font_path='STXINGKA.TTF',mask=imread('water3.png'),mode='RGBA',
                                background_color=None).generate(' '.join(txtcut))
    img = imread('water3.png')
    cloud_colors = wordcloud.ImageColorGenerator(np.array(img))
    cloud.recolor(color_func=cloud_colors)
    plt.imshow(cloud)
    plt.axis('off')
    plt.savefig('watercloud3.png',dpi=400)
    plt.show()
    
    # Export per-document keywords: one row per document — first cell the
    # document id, then up to 15 "term weight%" cells.
    myexcel = xlwt.Workbook()
    sheet = myexcel.add_sheet("sheet1")
    si=-1
    sj=-1
    for line in final:
        si=si+1
        sj=-1
        for i in line:
            sj=sj+1
            sheet.write(si,sj,str(i))
    
    myexcel.save("各條分詞.xls")
    
    
    #persist the vocabulary: column 0 the term, column 1 its integer id
    myexcel = xlwt.Workbook()
    sheet = myexcel.add_sheet("sheet2")
    p=0
    for i in test_set.vocabulary.keys():
        sheet.write(p,0,i)
        print(i)
        sheet.write(p,1,str(test_set.vocabulary[i]))
        p=p+1
            
    myexcel.save("詞匯id.xls")

 

 各條分詞:

 

 

 詞匯id:

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM