Machine Learning - Text Clustering Example - kmeans
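This example walks through a small text-clustering pipeline: the script below reads a folder of raw documents, segments them with jieba and strips stop words, trains a Doc2Vec model to turn each document into a fixed-length vector, and then groups those vectors into 15 clusters with scikit-learn's KMeans, writing each document together with its cluster label to a result file.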
import os
import gensim
import jieba
from gensim.models.doc2vec import Doc2Vec
from sklearn.cluster import KMeans

TaggededDocument = gensim.models.doc2vec.TaggedDocument

# Two helper functions for reading and saving files
# Save content to a file
def savefile(savepath, content, encode):
    fp = open(savepath, "w", encoding=encode)
    fp.write(content)
    fp.close()

# Read a file
def readfile(path, encode):
    content = None
    try:
        fp = open(path, "r", encoding=encode)
        content = fp.read()
        fp.close()
    except UnicodeDecodeError:
        print("Error: failed to read file")
    else:
        return content

stop_words_path = '/Users/FengZhen/Desktop/accumulate/機器學習/kmeans聚類/中文停用詞表.txt'
origin_text_path = '/Users/FengZhen/Desktop/accumulate/機器學習/kmeans聚類/測試文本集/'
cut_combine_path = '/Users/FengZhen/Desktop/accumulate/機器學習/kmeans聚類/cut_combine.txt'
corpus_text_path = '/Users/FengZhen/Desktop/accumulate/機器學習/kmeans聚類/corpus.txt'
result_text_path = '/Users/FengZhen/Desktop/accumulate/機器學習/kmeans聚類/own_claasify.txt'

# Segment each document with jieba, drop stop words, and save the combined result
def segment():
    # the stop-word file holds one word per line
    stop_words = set(readfile(stop_words_path, 'UTF-8').splitlines())
    fileList = os.listdir(origin_text_path)
    save = open(cut_combine_path, 'w')
    for file in fileList:
        if not file.startswith("."):
            content_result = ''
            content = readfile(origin_text_path + file, 'GBK')
            content_words = jieba.cut(content)
            for content_word in content_words:
                if content_word not in stop_words:
                    content_result = content_result + " " + content_word
            save.write(content_result.replace('\r', '').replace('\n', ''))
            save.write('\n')
    save.close()

def get_datasest():
    with open(cut_combine_path, 'r') as cf:
        docs = cf.readlines()
    print(len(docs))

    x_train = []
    for i, text in enumerate(docs):
        word_list = text.split(' ')
        l = len(word_list)
        word_list[l - 1] = word_list[l - 1].strip()
        # Before training, wrap the corpus in the form the model expects: a TaggedDocument
        # holding the word list plus a list of tags
        document = TaggededDocument(word_list, tags=[i])
        x_train.append(document)
    return x_train

def train(x_train, size=200):
    # Initialize the model with the training parameters, then save the result to free memory.
    # Passing x_train builds the vocabulary; min_count ignores words whose total frequency is
    # below this value; window is the maximum distance between the predicted word and its
    # context words; size is the dimensionality of the feature vectors (called vector_size in
    # gensim >= 4.0); negative is the number of noise words drawn for negative sampling;
    # workers is the number of worker threads.
    model_dm = Doc2Vec(x_train, min_count=1, window=3, size=size, sample=1e-3, negative=5, workers=4)
    # corpus_count is the number of documents; epochs is the number of training passes
    model_dm.train(x_train, total_examples=model_dm.corpus_count, epochs=100)
    # Save the trained model to free memory; it can be reloaded later with load()
    model_dm.save(corpus_text_path)
    return model_dm

def cluster(x_train):
    infered_vectors_list = []
    print("load doc2vec model...")
    # Load the trained model; printing model_dm gives something like Doc2Vec(dm/m,d500,n5,w3,s0.001,t4)
    model_dm = Doc2Vec.load(corpus_text_path)
    print("load train vectors...")
    for text, label in x_train:
        vector = model_dm.infer_vector(text)
        infered_vectors_list.append(vector)

    print("train kmeans model...")
    kmean_model = KMeans(n_clusters=15)
    kmean_model.fit(infered_vectors_list)
    # assign a cluster label to every document
    labels = kmean_model.predict(infered_vectors_list)
    print(labels)
    cluster_centers = kmean_model.cluster_centers_

    # write each document and its cluster label, separated by a tab
    with open(result_text_path, 'w') as wf:
        i = 0
        while i < len(x_train):
            string = ""
            text = x_train[i][0]
            for word in text:
                string = string + word
            string = string + '\t'
            string = string + str(labels[i])
            string = string + '\n'
            wf.write(string)
            i = i + 1
    return cluster_centers

if __name__ == '__main__':
    segment()
    x_train = get_datasest()
    model_dm = train(x_train)
    cluster_centers = cluster(x_train)
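As a follow-up, here is a minimal sketch of how the trained artifacts could be reused to place a new, unseen document into one of the existing clusters. It assumes the script above has already run in the same session, so that the Doc2Vec model was saved at corpus_text_path and cluster_centers is the array returned by cluster(); the new_text string and the nearest-center lookup are illustrative additions, not part of the original script.

import numpy as np
import jieba
from gensim.models.doc2vec import Doc2Vec

# Reload the Doc2Vec model that train() saved earlier
model_dm = Doc2Vec.load(corpus_text_path)

# Segment the new document the same way segment() handles the training corpus
new_text = "這裡是一篇待歸類的新文檔"  # hypothetical example input
new_words = list(jieba.cut(new_text))

# Infer a vector for the unseen document and assign it to the nearest KMeans center
new_vector = model_dm.infer_vector(new_words)
distances = np.linalg.norm(cluster_centers - new_vector, axis=1)
print("new document assigned to cluster:", int(np.argmin(distances)))

Measuring the Euclidean distance to the stored centers mirrors how KMeans itself assigns points, so only the cluster_centers array needs to be kept around rather than the fitted KMeans object.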