python使用KNN文本分類

本文轉載自查看原文 2017-09-01 06:14 5003 python/ sklearn/ KNN

本文對上次爬取的「爸爸」、「媽媽」、「老師」和「自己」四類作文，利用 sklearn.neighbors.KNeighborsClassifier 進行分類。

import jieba
import pandas as pd
import numpy as np
import os
import itertools 
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.decomposition import PCA

# Read every file found under `path` into one DataFrame row each:
# (file path, stripped file text, name of the folder that contains it).
# Raw string: '\作' is not a valid escape sequence in a normal literal.
path = r'E:\作文'
rows = []
for root, dirs, files in os.walk(path):
    # The folder holding a file is used as that file's category label.
    kind = os.path.basename(root)
    for name in files:
        filepath = os.path.join(root, name)
        # The context manager closes each handle as soon as it is read,
        # so walking a large corpus does not leak open files.
        with open(filepath, 'r', encoding='utf-8') as f:
            text = f.read()
        rows.append([filepath, text.strip(), kind])
# Build the frame once instead of growing it row by row with .loc.
corpos = pd.DataFrame(rows, columns=['filepath', 'text', 'kind'])

# Load the stop-word list used when building the term-count matrix.
# The file is read as one entry per line, with the first line taken as the
# column header (the vectorizer setup looks up the 'stopword' column).
# Lines are read directly instead of pd.read_csv(sep='\n'), because recent
# pandas releases reject the line terminator as a separator (ValueError).
with open(r'Stopwords.txt', 'r', encoding='utf-8') as stopword_file:
    stopword_lines = [line.rstrip('\n') for line in stopword_file]
# Drop empty lines, mirroring read_csv's default skip_blank_lines behaviour.
stopword_lines = [line for line in stopword_lines if line]
stopwords = pd.DataFrame(stopword_lines[1:], columns=stopword_lines[:1])
def tokenizer(s):
    """Segment the string `s` with jieba.cut and return the tokens as a list."""
    return list(jieba.cut(s))
# Build the document-term count matrix from the text column (column 1),
# tokenizing with `tokenizer` and dropping the loaded stop words.
stopword_list = list(stopwords['stopword'])
count = CountVectorizer(tokenizer=tokenizer, stop_words=stopword_list)
term_counts = count.fit_transform(corpos.iloc[:, 1])
# Densify the result of fit_transform into a plain array.
countvector = term_counts.toarray()

# Convert the category labels to numbers.
# `kind` becomes the sorted array of distinct labels; `kind_codes[r]` is the
# position of row r's label inside `kind`.
kind, kind_codes = np.unique(corpos['kind'].values, return_inverse=True)
# 1-based float code per document, sized from the corpus itself rather than
# a fixed 700 so any number of documents works.
nkind = (kind_codes + 1).astype(float)
         
# Project the term-count matrix onto two principal components and draw one
# scatter series per category.
pca = PCA(n_components=2)
newvector = pca.fit_transform(countvector)
plt.figure()
colors = ['r', 'b', 'g', 'y']
markers = ['o', '^', '>', '<']
# zip stops at the shortest input, so at most four categories are drawn.
for i, c, m in zip(range(len(kind)), colors, markers):
    label = kind[i]
    # Row labels of the documents belonging to this category.
    index = corpos[corpos['kind'] == label].index
    x, y = newvector[index, 0], newvector[index, 1]
    plt.scatter(x, y, c=c, marker=m, label=label)
plt.legend()
plt.xlim(-5, 10)
plt.ylim(-20, 50)
plt.xlabel('X Label')
plt.ylabel('Y Label')

# Randomly hold out a test set.
# A permutation gives distinct rows (no document is sampled twice) and works
# for any corpus size; up to 200 documents are held out, never more than half.
n_samples = len(corpos)
test_size = min(200, n_samples // 2)
shuffled = np.random.permutation(n_samples)
index = shuffled[:test_size]
train_index = shuffled[test_size:]
x_test = countvector[index]
y_test = corpos.iloc[index, 2]
x_train = countvector[train_index]
y_train = corpos.iloc[train_index, 2]


# Classify with KNN.
# Fit only on the rows that were not held out, so the score below measures
# performance on documents the classifier has not seen.
knn = KNeighborsClassifier()
knn.fit(x_train, y_train)
y_pred = knn.predict(x_test)
# Keep and report the score; as a bare expression it would be discarded
# when the file is run as a script.
accuracy = knn.score(x_test, y_test)
print('accuracy:', accuracy)

# Plot the confusion matrix of the KNN predictions.
# labels=kind fixes the row/column order to the same array used for the tick
# labels below, even if a category is absent from the sampled test set.
knn_confusion = confusion_matrix(y_test, y_pred, labels=kind)
# Sample output recorded alongside this code in the original post:
# array([[61,  1,  0,  3],
#        [ 8, 35,  0,  1],
#        [ 1,  0, 53,  1],
#        [ 9,  1,  2, 24]])

# Start a new figure so the matrix is not drawn onto the scatter plot's axes.
plt.figure()
plt.imshow(knn_confusion, interpolation='nearest', cmap=plt.cm.Oranges)
plt.xlabel('y_pred')
plt.ylabel('y_True')
tick_marks = np.arange(len(kind))
plt.xticks(tick_marks, kind, rotation=90)
plt.yticks(tick_marks, kind)
plt.colorbar()
plt.title('confustion_matrix')
# Write each count into its cell: x is the column (predicted label) and
# y is the row (true label).
n_classes = len(knn_confusion)
for col, row in itertools.product(range(n_classes), range(n_classes)):
    plt.text(col, row, knn_confusion[row, col],
             horizontalalignment="center")

數據散點圖如下所示：

knn分類結果的混淆矩陣圖如下所示：

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯繫本站郵箱yoyou2525@163.com刪除。

猜您在找：基於KNN的newsgroup 18828文本分類器的Python實現；使用Python進行語義相似度/文本分類；使用TextCNN實現文本分類；文本分類TextCNN；文本分類：survey；文本分類模型；CNN 文本分類；如何使用BERT實現中文的文本分類（附代碼）；機器學習-文本分類（2）-新聞文本分類；文本分類問題匯總