Python Machine Learning: Word Segmentation


Word segmentation with the jieba library

Installing jieba is not covered here; just Baidu it yourself!

import jieba

Segment a title and turn the result into a list:

seg_list = list(jieba.cut(result.get("title"), cut_all=False))

Join all segmented titles with spaces, which makes the later natural language processing easier:

para = para + " ".join(seg_list)

Put each segmented title (i.e. the space-separated title) into a list:

summaryList.insert(0," ".join(seg_list))
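
The three fragments above assume a `result` dict holding a title plus pre-initialized `para` and `summaryList` variables. As a minimal, self-contained sketch of the same flow, using two made-up titles (taken from the example posts later in this article) in place of the real `result.get("title")` values:

# -*- coding: utf-8 -*-
import jieba

# hypothetical titles standing in for the real result.get("title") values
titles = [u'我要吃蘋果不吃香蕉', u'夏季新款清新碎花雪紡連衣裙']

para = ""
summaryList = []
for title in titles:
    seg_list = list(jieba.cut(title, cut_all=False))  # precise-mode segmentation
    para = para + " ".join(seg_list) + " "            # all titles joined into one space-separated string
    summaryList.insert(0, " ".join(seg_list))         # one space-separated title per list element

print para
for s in summaryList:
    print s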

 

Count word frequencies

from nltk.tokenize import WordPunctTokenizer
import nltk

tokenizer = WordPunctTokenizer()
# Count word frequencies
sentences = tokenizer.tokenize(para)  # split para into a list of tokens
wordFreq = nltk.FreqDist(sentences)
for i in wordFreq:
    print i, wordFreq[i]
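
In NLTK 3, FreqDist is a Counter subclass, so besides iterating over it you can ask directly for the most frequent tokens. A small usage sketch based on the wordFreq object built above:

# the five most frequent tokens and their counts (assumes wordFreq from above)
for word, count in wordFreq.most_common(5):
    print word, count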

 

Convert the sentences to a bag of words. The input to this step is a list of sentences, with the words in each sentence separated by spaces; the result is the list of keywords that appear in those sentences, called the bag of words, and each sentence is then represented as a vector of word counts over that bag.

from sklearn.feature_extraction.text import CountVectorizer

# Convert to a bag of words
vectorizer = CountVectorizer(min_df=1,max_df=50)
# summaryList is a list whose elements are sentences with words separated by spaces.
# By default single-character tokens (i.e. a lone Chinese character) are ignored;
# overriding the vectorizer's token regex fixes that.
vectorizer.token_pattern='(?u)\\b\\w+\\b'
X = vectorizer.fit_transform(summaryList)

print X.shape
nums,features=X.shape   # number of posts and number of words in the bag; X.getrow(i) returns the vector for post i

print vectorizer
print str(vectorizer.get_feature_names()).decode("unicode-escape")
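
To make the shape of the output concrete, here is a tiny sketch with two made-up space-separated documents (the words are illustrative only). Each row of the resulting matrix is one document, each column one word of the bag, in alphabetical order:

from sklearn.feature_extraction.text import CountVectorizer

toy_docs = ["apple banana apple", "banana orange"]
toy_vectorizer = CountVectorizer(min_df=1)
toy_X = toy_vectorizer.fit_transform(toy_docs)

print toy_vectorizer.get_feature_names()   # feature names: apple, banana, orange
print toy_X.toarray()                      # [[2 1 0]
                                           #  [0 1 1]]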

 

A function that computes the Euclidean distance between two vectors:

# Euclidean distance between two sparse vectors
def dist_raw(v1,v2):
    delta=v1-v2
    return sp.linalg.norm(delta.toarray())
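
A quick sanity check of dist_raw against the matrix X built above (a sketch; the row indices are arbitrary and assume X has at least two rows):

# distance between the first two post vectors; a vector's distance to itself is 0
print dist_raw(X.getrow(0), X.getrow(1))
print dist_raw(X.getrow(0), X.getrow(0))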

 

Compute the vector for a new post:

# Test with a new post
new_para='我要吃蘋果不吃香蕉'
new_para_list=" ".join(list(jieba.cut(new_para, cut_all=False)))
new_vec=vectorizer.transform([new_para_list])  # new_para_list is one sentence, words separated by spaces
print 'new_vec:',new_vec

 

Compute the distance between the new post and each of the original posts:

for i in range(0,nums):
    para = paras[i]
    para_vec=X.getrow(i)
    d=dist_raw(new_vec,para_vec)
    print para," = ",d

 

 

Full code:

#!/usr/bin/python
# -*- coding: utf-8 -*-
print 'test OK'
import sys
from nltk.tokenize import WordPunctTokenizer
import nltk
import jieba
from sklearn.feature_extraction.text import CountVectorizer
import scipy as sp

reload(sys)
sys.setdefaultencoding("utf-8")

tokenizer = WordPunctTokenizer()
summaryList = [];
file=open("./para.txt")
paras=file.readlines()
words=""
for para in paras:
    print para
    seg_list = list(jieba.cut(para, cut_all=False))
    words +=" ".join(seg_list)
    summaryList.insert(0," ".join(seg_list))
#para='I like eat apple because apple is red but because I love fruit'
# Count word frequencies
sentences = tokenizer.tokenize(words)  # split the joined text into a list of tokens
#print sentences
wordFreq=nltk.FreqDist(sentences)
print str(wordFreq.keys()).decode("unicode-escape")
#print dir(wordFreq)
for i in wordFreq:
    print i,wordFreq[i]

print str(summaryList).decode("unicode-escape")
# Convert to a bag of words
vectorizer = CountVectorizer(min_df=1,max_df=50)
# summaryList is a list whose elements are sentences with words separated by spaces.
# By default single-character tokens (a lone Chinese character) are ignored;
# overriding the vectorizer's token regex fixes that.
vectorizer.token_pattern='(?u)\\b\\w+\\b'
X = vectorizer.fit_transform(summaryList)
print str(vectorizer.get_feature_names()).decode("unicode-escape")
print X.shape
nums,features=X.shape   # number of posts and number of words in the bag

# Euclidean distance between two sparse vectors
def dist_raw(v1,v2):
    delta=v1-v2
    return sp.linalg.norm(delta.toarray())

# Test with a new post
new_para='我要吃蘋果不吃香蕉'
new_para_list=" ".join(list(jieba.cut(new_para, cut_all=False)))
new_vec=vectorizer.transform([new_para_list])  # new_para_list is one sentence, words separated by spaces
print 'new_vec:',new_vec

for i in range(0,nums):
    para = paras[i]
    para_vec=X.getrow(i)
    d=dist_raw(new_vec,para_vec)
    print para," = ",d

 

Version 2:

#!/usr/bin/python
# -*- coding: utf-8 -*-
print 'test OK'
import sys
from nltk.tokenize import WordPunctTokenizer
import nltk
import jieba
from sklearn.feature_extraction.text import CountVectorizer
import scipy as sp

reload(sys)
sys.setdefaultencoding("utf-8")

tokenizer = WordPunctTokenizer()
summaryList = [];
file=open("./para.txt")
paras=file.readlines()
words=""
for para in paras:
    print para
    seg_list = list(jieba.cut(para, cut_all=False))
    words +=" ".join(seg_list)
    summaryList.insert(0," ".join(seg_list))
#para='I like eat apple because apple is red but because I love fruit'
# Count word frequencies
sentences = tokenizer.tokenize(words)  # split the joined text into a list of tokens
#print sentences
wordFreq=nltk.FreqDist(sentences)
print str(wordFreq.keys()).decode("unicode-escape")
#print dir(wordFreq)

print str(summaryList).decode("unicode-escape")
# Convert to a bag of words
vectorizer = CountVectorizer(min_df=0,max_df=20)
# summaryList is a list whose elements are sentences with words separated by spaces.
# By default single-character tokens (a lone Chinese character) are ignored;
# overriding the vectorizer's token regex would fix that.
#vectorizer.token_pattern='(?u)\\b\\w+\\b'
X = vectorizer.fit_transform(summaryList)
print str(vectorizer.get_feature_names()).decode("unicode-escape")
print X.shape
nums,features=X.shape   # number of posts and number of words in the bag

# Euclidean distance between two sparse vectors
def dist_raw(v1,v2):
    delta=v1-v2
    return sp.linalg.norm(delta.toarray())

# Test with a new post
new_para='夏季新款清新碎花雪紡連衣裙,收腰顯瘦設計;小V領、小碎花、荷葉袖、荷葉邊的結合使得這款連衣裙更顯精致,清新且顯氣質。'
new_para_list=" ".join(list(jieba.cut(new_para, cut_all=False)))
new_vec=vectorizer.transform([new_para_list])  # new_para_list is one sentence, words separated by spaces
#print 'new_vec:',new_vec.toarray()

minDis = 9999
title=""
for i in range(0,nums):
    para = summaryList[i]
    para_vec=X.getrow(i)
    d=dist_raw(new_vec,para_vec)
    #print X.getrow(i).toarray(),' = ',d
    if(minDis > d):
        minDis = d
        title = para
print title," = ",minDis   # closest post and its distance
print new_para_list
print title

 

Run output:

