Machine Learning: A Text Clustering Example with KMeans

The script below tokenizes a folder of documents with jieba, removes stop words, trains a Doc2Vec model with gensim, and then clusters the inferred document vectors with scikit-learn's KMeans.

import os

import gensim
import jieba

from gensim.models.doc2vec import Doc2Vec
from sklearn.cluster import KMeans

TaggedDocument = gensim.models.doc2vec.TaggedDocument


# Two helper functions for writing and reading files
# Write content to a file
def savefile(savepath, content, encode):
    with open(savepath, "w", encoding=encode) as fp:
        fp.write(content)


# Read a file and return its content (None if decoding fails)
def readfile(path, encode):
    content = None
    try:
        with open(path, "r", encoding=encode) as fp:
            content = fp.read()
    except UnicodeDecodeError:
        print("Error: failed to read file", path)
    return content


stop_words_path = '/Users/FengZhen/Desktop/accumulate/機器學習/kmeans聚類/中文停用詞表.txt'
origin_text_path = '/Users/FengZhen/Desktop/accumulate/機器學習/kmeans聚類/測試文本集/'
cut_combine_path = '/Users/FengZhen/Desktop/accumulate/機器學習/kmeans聚類/cut_combine.txt'
corpus_text_path = '/Users/FengZhen/Desktop/accumulate/機器學習/kmeans聚類/corpus.txt'
result_text_path = '/Users/FengZhen/Desktop/accumulate/機器學習/kmeans聚類/own_claasify.txt'

# Tokenize each document, remove stop words, and save the results
def segment():
    # Build a set of stop words (one per line) so membership tests match whole words
    stop_words_content = readfile(stop_words_path, 'UTF-8')
    stop_words = set(stop_words_content.splitlines()) if stop_words_content else set()
    fileList = os.listdir(origin_text_path)
    save = open(cut_combine_path, 'w')
    for file in fileList:
        if not file.startswith("."):
            content_result = ''
            content = readfile(origin_text_path + file, 'GBK')
            if content is None:
                continue
            content_words = jieba.cut(content)
            for content_word in content_words:
                if content_word not in stop_words:
                    content_result = content_result + " " + content_word
            # One document per line, with line breaks inside the text stripped
            save.write(content_result.replace('\r', '').replace('\n', ''))
            save.write('\n')
    save.close()
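
# After segment() runs, each line of cut_combine.txt holds one tokenized document with the
# tokens separated by spaces, for example (the tokens here are purely illustrative):
#   " 機器 學習 文本 聚類 算法 ..."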


def get_datasest():
    with open(cut_combine_path, 'r') as cf:
        docs = cf.readlines()
        print(len(docs))

    x_train = []
    for i, text in enumerate(docs):
        word_list = text.split(' ')
        l = len(word_list)
        word_list[l - 1] = word_list[l - 1].strip()
        # Before training, wrap each document in the form Doc2Vec expects: a TaggedDocument
        # holding the bag of words plus a list of tags (here the document's line index)
        document = TaggedDocument(word_list, tags=[i])
        x_train.append(document)

    return x_train
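
# Each element of x_train pairs a token list with its line index as the tag; a sketch of
# what one entry looks like (tokens illustrative):
#   TaggedDocument(words=['機器', '學習', '文本'], tags=[0])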


def train(x_train, size=200):
    # Initialize and train the model, then save the result so memory can be freed.
    # x_train provides the training corpus; min_count drops words whose total frequency is
    # below the threshold; window is the maximum distance between the predicted word and its
    # context words; size is the dimensionality of the feature vectors (named vector_size in
    # gensim 4.0+); negative is the number of noise words for negative sampling; workers is
    # the number of worker threads.
    model_dm = Doc2Vec(x_train, min_count=1, window=3, size=size, sample=1e-3, negative=5, workers=4)
    # corpus_count is the number of documents; epochs is the number of training passes
    model_dm.train(x_train, total_examples=model_dm.corpus_count, epochs=100)
    # Save the trained model; it can be reloaded later with Doc2Vec.load
    model_dm.save(corpus_text_path)
    return model_dm
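
# A minimal sketch (not part of the original flow) of reloading the saved model and
# inferring a vector for a new, already-tokenized document:
#   model = Doc2Vec.load(corpus_text_path)
#   vec = model.infer_vector(['新聞', '標題', '分詞'])  # illustrative tokens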


def cluster(x_train):
    infered_vectors_list = []
    print("load doc2vec model...")
    # Load the trained model; printing model_dm shows its configuration,
    # e.g. Doc2Vec(dm/m,d200,n5,w3,s0.001,t4)
    model_dm = Doc2Vec.load(corpus_text_path)
    print("load train vectors...")
    for text, label in x_train:
        vector = model_dm.infer_vector(text)
        infered_vectors_list.append(vector)

    print("train kmean model...")
    kmean_model = KMeans(n_clusters=15)
    kmean_model.fit(infered_vectors_list)
    # Predict a cluster label for every document
    labels = kmean_model.predict(infered_vectors_list)
    print(labels)
    cluster_centers = kmean_model.cluster_centers_

    # Write one line per document: "<concatenated tokens>\t<cluster label>"
    with open(result_text_path, 'w') as wf:
        for i in range(len(x_train)):
            text = x_train[i][0]
            string = "".join(text) + '\t' + str(labels[i]) + '\n'
            wf.write(string)

    return cluster_centers
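
# The choice of n_clusters=15 above is a fixed guess; a sketch (assuming scikit-learn's
# silhouette_score is available) for sanity-checking different k values on the inferred vectors:
#   from sklearn.metrics import silhouette_score
#   for k in range(5, 30, 5):
#       km = KMeans(n_clusters=k).fit(infered_vectors_list)
#       print(k, silhouette_score(infered_vectors_list, km.labels_))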


if __name__ == '__main__':
    segment()
    x_train = get_datasest()
    model_dm = train(x_train)
    cluster_centers = cluster(x_train)
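
# Each line written to own_claasify.txt has the form "<concatenated tokens>\t<cluster id>",
# e.g. (illustrative): 機器學習文本聚類	7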

 

