本文對上次爬取的爸爸、媽媽、老師和自己四類作文,利用 sklearn.neighbors.KNeighborsClassifier 進行分類。
import itertools
import os

import jieba
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier

# Upper bound on the number of held-out essays used for evaluation.
TEST_SIZE = 200


def _load_corpus(root_dir):
    """Read every file under *root_dir* into a DataFrame.

    The returned frame has the columns ``filepath``, ``text`` (file content
    with surrounding whitespace stripped) and ``kind`` (name of the directory
    that directly contains the file, used as the class label).
    """
    rows = []
    for root, _dirs, files in os.walk(root_dir):
        label = os.path.basename(root)
        for name in files:
            filepath = os.path.join(root, name)
            # `with` guarantees the handle is closed even if read() fails.
            with open(filepath, 'r', encoding='utf-8') as f:
                text = f.read()
            rows.append([filepath, text.strip(), label])
    # Building the frame once avoids the quadratic cost of appending row by row.
    return pd.DataFrame(rows, columns=['filepath', 'text', 'kind'])


def _read_stopwords(filepath):
    """Return the stop words stored one per line in *filepath*.

    A leading ``stopword`` header line and empty lines are dropped.
    """
    # utf-8-sig reads plain UTF-8 too, and strips a BOM if one is present.
    with open(filepath, encoding='utf-8-sig') as f:
        words = [line.rstrip('\r\n') for line in f]
    if words and words[0] == 'stopword':
        words = words[1:]
    return [word for word in words if word]


def tokenizer(s):
    """Segment *s* with jieba and return the tokens as a list."""
    return list(jieba.cut(s))


# Read the file contents
path = r'E:\作文'
corpos = _load_corpus(path)

# Set up the stop words and build the term-frequency matrix.
# The file is read line by line because recent pandas versions reject
# read_csv(sep='\n'); the result is kept as a one-column DataFrame so that
# `stopwords['stopword']` keeps working.
stopwords = pd.DataFrame({'stopword': _read_stopwords('Stopwords.txt')})

count = CountVectorizer(tokenizer=tokenizer,
                        stop_words=list(stopwords['stopword']))
countvector = count.fit_transform(corpos.iloc[:, 1]).toarray()

# Convert the class labels to numbers (1..len(kind))
n_samples = len(corpos)
kind = np.unique(corpos['kind'].values)
nkind = np.zeros(n_samples)  # sized from the data instead of a hard-coded 700
for code, label in enumerate(kind, start=1):
    nkind[(corpos['kind'] == label).values] = code

# Project the term-frequency matrix to two dimensions and plot it
pca = PCA(n_components=2)
newvector = pca.fit_transform(countvector)
plt.figure()
# zip() stops at the shortest input, so at most four classes are drawn.
for label, c, m in zip(kind, ['r', 'b', 'g', 'y'], ['o', '^', '>', '<']):
    mask = (corpos['kind'] == label).values
    plt.scatter(newvector[mask, 0], newvector[mask, 1],
                c=c, marker=m, label=label)
plt.legend()
plt.xlim(-5, 10)
plt.ylim(-20, 50)
plt.xlabel('X Label')
plt.ylabel('Y Label')

# Randomly pick the test set.
# Sampling without replacement and training only on the remaining rows keeps
# the test essays out of the training data; fitting on everything would let
# each test point find itself among its neighbours and inflate the score.
test_size = min(TEST_SIZE, n_samples // 2)
index = np.random.choice(n_samples, size=test_size, replace=False)
train_mask = np.ones(n_samples, dtype=bool)
train_mask[index] = False
x_test = countvector[index]
y_test = corpos.iloc[index, 2]

# Classify with KNN
knn = KNeighborsClassifier()
knn.fit(countvector[train_mask], corpos['kind'].values[train_mask])
y_pred = knn.predict(x_test)
accuracy = knn.score(x_test, y_test)
print('knn accuracy:', accuracy)

# Plot the confusion matrix of the KNN classification results
# Pin the row/column order to `kind` so the matrix always has one row and one
# column per class and lines up with the tick labels below, even when a class
# happens to be absent from the sampled test set.
knn_confusion = confusion_matrix(y_test, y_pred, labels=kind)
# Example output from one run:
# array([[61,  1,  0,  3],
#        [ 8, 35,  0,  1],
#        [ 1,  0, 53,  1],
#        [ 9,  1,  2, 24]])

# Start a new figure; without it the heatmap is drawn on whatever axes are
# current when the file runs as a plain script.
plt.figure()
plt.imshow(knn_confusion, interpolation='nearest', cmap=plt.cm.Oranges)
plt.xlabel('y_pred')
plt.ylabel('y_True')
tick_marks = np.arange(len(kind))
plt.xticks(tick_marks, kind, rotation=90)
plt.yticks(tick_marks, kind)
plt.colorbar()
plt.title('confusion_matrix')
# Write each count into its cell: imshow puts matrix row r at y=r and
# column c at x=c, so the text position is (c, r).
n_rows, n_cols = knn_confusion.shape
for row, col in itertools.product(range(n_rows), range(n_cols)):
    plt.text(col, row, knn_confusion[row, col],
             horizontalalignment="center", verticalalignment="center")
plt.show()
數據散點圖如下所示:

knn分類結果的混淆矩陣圖如下所示: