本文使用word2vec(100維)做聚類,訓練文本中一行是一條數據(已分詞),具體代碼如下:
from sklearn.cluster import KMeans from sklearn import preprocessing from sklearn.feature_extraction.text import CountVectorizer from sklearn.feature_extraction.text import TfidfTransformer #from sklearn.decomposition import PCA from gensim.models import Word2Vec import nltk from nltk.corpus import stopwords #from sklearn.model_selection import train_test_split import random import matplotlib.pyplot as plt %matplotlib inline #from sklearn.datasets.samples_generator import make_blob
加載文本:
# Load the pre-tokenized corpus: one document per line, already segmented
# into space-separated tokens with stop words removed.
with open('generate_data/sents_for_kmeans.txt','r',encoding='utf-8') as f:
    sents = [line.replace('\n', '') for line in f]
文本去重:
# Deduplicate the corpus. `list(set(...))` yields an arbitrary,
# run-to-run-unstable ordering, which makes every downstream result
# (training, clustering) non-reproducible; dict.fromkeys deduplicates
# while preserving first-occurrence order (guaranteed since Python 3.7).
sents = list(dict.fromkeys(sents))
print(len(sents))
# Print one sample document; guard against corpora with <= 10 lines,
# where the original `sents[10]` would raise IndexError.
if len(sents) > 10:
    print(sents[10])
結果如下:
訓練word2vec模型:
# Split each document on single spaces back into its token list,
# then fit a Word2Vec model (gensim defaults) on the tokenized corpus.
all_words = []
for sent in sents:
    all_words.append(sent.split(' '))
word2vec = Word2Vec(all_words)
查看詞典:
# Inspect the vocabulary learned by Word2Vec.
# NOTE(review): `wv.vocab` exists only in gensim 3.x; gensim 4+ replaced it
# with `wv.key_to_index` — confirm the installed gensim version.
vocabulary = word2vec.wv.vocab
print(vocabulary.keys())
len(vocabulary)
將所有的詞向量匯合到一個list中:
# Gather one embedding vector per vocabulary word into a single list,
# in vocabulary iteration order (this is what gets clustered below).
vectors = [word2vec.wv[item] for item in vocabulary]
訓練kmeans模型:
# Cluster the word vectors into two groups with k-means.
# NOTE(review): this clusters individual *word* embeddings, not sentences;
# to cluster sentences, a per-sentence vector (e.g. the mean of its word
# vectors) would be needed first — confirm this is the intent.
num_clusters = 2
# `n_jobs` was deprecated in scikit-learn 0.23 and removed in 1.0;
# passing it crashes on any modern sklearn, so it is dropped here
# (k-means parallelism is now handled internally via threading).
km_cluster = KMeans(
    n_clusters=num_clusters,
    max_iter=300,
    n_init=40,
    init='k-means++',
)
km_cluster.fit(vectors)
圖形化展示:
# Visualise the cluster assignments.
# NOTE(review): plt.plot(vectors[j], mark[label]) plots all components of a
# word vector against their indices — hard to interpret for 100-d vectors;
# projecting to 2-D first (e.g. PCA) would give a true scatter plot.
cents = km_cluster.cluster_centers_   # centroid per cluster
labels = km_cluster.labels_           # cluster index assigned to each vector
inertia = km_cluster.inertia_         # within-cluster sum of squared distances
mark = ['or','ob']                    # one marker style per cluster (red/blue)
# enumerate replaces the original manual `j` counter; the unused
# `color = 0` local has been removed.
for j, label in enumerate(labels):
    plt.plot(vectors[j], mark[label], markersize=5)
plt.show()