# jieba文本分詞,去除停用詞,添加用戶詞
# (jieba text segmentation: remove stop words, add user-defined words)


import jieba
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from PIL import Image
import numpy as np
import jieba.analyse
from pyquery import PyQuery

# Read the corpus from the local file. A context manager guarantees the
# handle is closed (the original `open(...).read()` leaked the descriptor).
with open('./santi.txt', 'r', encoding='utf-8') as _corpus_file:
    santi_text = _corpus_file.read()

# Enable parallel segmentation with 4 worker processes.
# NOTE(review): jieba.enable_parallel raises NotImplementedError on
# Windows — confirm the target platform, or wrap in try/except there.
jieba.enable_parallel(4)

# Load an external user dictionary so domain-specific terms survive segmentation.
jieba.load_userdict('./userdict.txt')

# 創建停用詞list
# Build the stop-word list
def stopwordslist(filepath):
    """Load stop words from *filepath* (one word per line, UTF-8).

    Returns a list of stripped, non-empty stop words. The file is opened
    with a context manager so the handle is always closed (the original
    leaked it), and blank lines are dropped instead of yielding ''.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        return [word for word in (line.strip() for line in f) if word]

# 對句子去除停用詞
# Remove stop words from a token stream
def movestopwords(sentence):
    """Filter a token iterable: drop stop words and single-character tokens.

    sentence: iterable of tokens (e.g. the generator returned by jieba.cut).
    Returns a list of the surviving tokens, in order.

    The stop-word list is converted to a set so each membership test is
    O(1) instead of the original O(n) list scan per token.
    """
    stopwords = set(stopwordslist('./stop_words.txt'))  # stop-word file path
    return [tok for tok in sentence if len(tok) > 1 and tok not in stopwords]

def main():
    """Run the full pipeline: strip HTML, segment the corpus, remove stop
    words, print TF-IDF / TextRank / raw-frequency rankings, and render a
    word cloud to screen and to ``zzz.png``.
    """
    words = jieba.cut(PyQuery(santi_text).text())  # PyQuery(...).text() strips HTML tags
    word_list = movestopwords(words)               # drop stop words and 1-char tokens
    words_split = " ".join(word_list)              # jieba.analyse expects a plain string

    # Banner typo fixed: "tf-tdf" -> "tf-idf".
    print('以下是tf-idf算法-------------------------------------------------')
    keywords_tf = jieba.analyse.extract_tags(
        words_split, topK=100, withWeight=True,
        allowPOS=('ns', 'n', 'vn', 'v'))  # TF-IDF keyword extraction
    for word, weight in keywords_tf:
        print(word, weight)

    print('以下是textrank算法-------------------------------------------------')
    keywords_rank = jieba.analyse.textrank(
        words_split, topK=100, withWeight=True,
        allowPOS=('ns', 'n', 'vn', 'v'))  # TextRank keyword extraction
    for word, weight in keywords_rank:
        print(word, weight)

    print('以下是純詞頻統計-------------------------------------------------')
    mycount = Counter(word_list)  # raw frequency count
    for key, val in mycount.most_common(100):  # top 100, most frequent first
        print(key, val)

    #alice_mask = np.array(Image.open("./zhihu.png"))  # optional shape mask
    wc = WordCloud(
        # width=800,
        # height=600,
        background_color="#000000",  # background colour
        max_words=50,         # maximum number of words shown (default 200)
        max_font_size=400,    # largest font size
        min_font_size=10,     # smallest font size (default 4)
        #colormap='bone',     # string or matplotlib colormap, default="viridis"
        random_state=42,      # fix layout/colour randomness for reproducible output
        #mask=plt.imread("./zhihu.png"),  # read the mask image
        #mask=alice_mask,     # apply the shape mask
        font_path='./SimHei.ttf'  # CJK-capable font, required for Chinese glyphs
    )

    my_wordcloud = wc.generate(words_split)  # build the cloud from word frequencies
    plt.imshow(my_wordcloud)                 # display the word cloud
    plt.axis("off")                          # hide the axes
    plt.show()
    wc.to_file('zzz.png')                    # save the rendered image


if __name__ == '__main__':
    main()

 


# ---------------------------------------------------------------------------
# Scraped-page boilerplate below (not part of the program); kept as comments
# so the file remains valid Python.
#
# 免責聲明!
# 本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。
# 粵ICP備18138465號   © 2018-2025 CODEPRJ.COM