Word segmentation with the jieba library
Installing jieba is not covered here; a quick search will sort that out!
import jieba
Segment the title and convert the result to a list:
seg_list = list(jieba.cut(result.get("title"), cut_all=False))
Join all the titles with spaces, which makes the later natural language processing steps easier:
para = para + " ".join(seg_list)
Put each segmented title (a space-separated string) into a list:
summaryList.insert(0," ".join(seg_list))
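Taken on their own, these three snippets use variables that are defined elsewhere. Below is a minimal sketch of how they fit together, assuming a hypothetical `results` list of dicts with a "title" key (for example, posts from an earlier crawl); the full script at the end of this post reads titles from para.txt instead.

import jieba

# Hypothetical input: a list of dicts with a "title" key, made up for illustration.
results = [{"title": u"我要吃蘋果"}, {"title": u"我不吃香蕉"}]

para = ""          # all segmented titles joined into one space-separated string
summaryList = []   # one space-separated, segmented title per element

for result in results:
    seg_list = list(jieba.cut(result.get("title"), cut_all=False))
    para = para + " ".join(seg_list)
    summaryList.insert(0, " ".join(seg_list))

print para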
Count the word frequencies:
from nltk.tokenize import WordPunctTokenizer
import nltk

tokenizer = WordPunctTokenizer()
# Count word frequencies
sentences = tokenizer.tokenize(para)  # splits para into a list of tokens (Chinese tokens print as escaped byte strings)
wordFreq = nltk.FreqDist(sentences)
for i in wordFreq:
    print i, wordFreq[i]
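If only the most frequent tokens are of interest: in NLTK 3, FreqDist subclasses collections.Counter, so (assuming NLTK 3 is installed) the top of the table can be printed directly:

# Assuming NLTK 3, where FreqDist is a collections.Counter subclass
for word, count in wordFreq.most_common(10):
    print word, count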
Convert to a bag of words. The input of this step is a list of sentences (with words separated by spaces); the result is a list of the keywords appearing in those sentences, which is called the bag of words.
# Convert to a bag of words
vectorizer = CountVectorizer(min_df=1, max_df=50)
# summaryList is a list; each element is one sentence with words separated by spaces.
# By default, single-character tokens (i.e. a single Chinese character) are ignored;
# changing the vectorizer's token regex fixes the single-character problem.
vectorizer.token_pattern = '(?u)\\b\\w+\\b'
X = vectorizer.fit_transform(summaryList)
print X.shape
nums, features = X.shape  # number of posts and number of words in the bag; X.getrow(i) gives the vector of post i
print vectorizer
print str(vectorizer.get_feature_names()).decode("unicode-escape")
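To see why the token_pattern tweak matters, here is a small self-contained sketch with made-up, already-segmented sentences: the default pattern drops single-character tokens, while a relaxed pattern passed to the constructor keeps them.

# -*- coding: utf-8 -*-
from sklearn.feature_extraction.text import CountVectorizer

# Made-up, already-segmented sentences (space-separated tokens)
docs = [u"我 愛 吃 蘋果", u"我 愛 吃 香蕉"]

# The default token_pattern (?u)\b\w\w+\b requires tokens of at least two characters,
# so single characters such as 我 / 愛 / 吃 never reach the vocabulary.
v_default = CountVectorizer(min_df=1)
v_default.fit_transform(docs)
# (newer scikit-learn versions use get_feature_names_out instead)
print str(v_default.get_feature_names()).decode("unicode-escape")   # only 蘋果 and 香蕉

# Passing the relaxed pattern in the constructor keeps single-character tokens.
v_single = CountVectorizer(min_df=1, token_pattern=u'(?u)\\b\\w+\\b')
v_single.fit_transform(docs)
print str(v_single.get_feature_names()).decode("unicode-escape")    # 我 愛 吃 also appear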
A function that computes the Euclidean distance:
# Compute the Euclidean distance between two count vectors
def dist_raw(v1, v2):
    delta = v1 - v2
    return sp.linalg.norm(delta.toarray())
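dist_raw works on the raw count vectors, so two posts about the same topic can still look far apart when one is much longer than the other. A common refinement (not part of the original script, shown here only as a sketch) is to normalize both vectors to unit length before taking the distance:

# Sketch of a normalized variant: scale both vectors to unit length first,
# so posts are compared by word distribution rather than by length.
def dist_norm(v1, v2):
    v1_normalized = v1 / sp.linalg.norm(v1.toarray())
    v2_normalized = v2 / sp.linalg.norm(v2.toarray())
    delta = v1_normalized - v2_normalized
    return sp.linalg.norm(delta.toarray())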
Compute the vector for a new post:
# Test
new_para = '我要吃蘋果不吃香蕉'
new_para_list = " ".join(list(jieba.cut(new_para, cut_all=False)))
new_vec = vectorizer.transform([new_para_list])  # new_para_list is one sentence with words separated by spaces
print 'new_vec:', new_vec
Compute the distance between the new post and each of the original posts:
for i in range(0, nums):
    para = summaryList[i]  # same order as the rows of X (paras is reversed because of insert(0, ...))
    para_vec = X.getrow(i)
    d = dist_raw(new_vec, para_vec)
    print para, " = ", d
Full code:
#!/usr/bin/python
# -*- coding: utf-8 -*-
print 'test OK'
import sys
from nltk.tokenize import WordPunctTokenizer
import nltk
import jieba
from sklearn.feature_extraction.text import CountVectorizer
import scipy as sp

reload(sys)
sys.setdefaultencoding("utf-8")

tokenizer = WordPunctTokenizer()
summaryList = []
file = open("./para.txt")
paras = file.readlines()
words = ""
for para in paras:
    print para
    seg_list = list(jieba.cut(para, cut_all=False))
    words += " ".join(seg_list)
    summaryList.insert(0, " ".join(seg_list))
#para='I like eat apple because apple is red but because I love fruit'

# Count word frequencies
sentences = tokenizer.tokenize(words)  # splits words into a list of tokens
#print sentences
wordFreq = nltk.FreqDist(sentences)
print str(wordFreq.keys()).decode("unicode-escape")
#print dir(wordFreq)
for i in wordFreq:
    print i, wordFreq[i]

print str(summaryList).decode("unicode-escape")

# Convert to a bag of words
vectorizer = CountVectorizer(min_df=1, max_df=50)
# summaryList is a list; each element is one sentence with words separated by spaces.
# By default, single-character tokens (a single Chinese character) are ignored;
# changing the vectorizer's token regex fixes that.
vectorizer.token_pattern = '(?u)\\b\\w+\\b'
X = vectorizer.fit_transform(summaryList)
print str(vectorizer.get_feature_names()).decode("unicode-escape")
print X.shape
nums, features = X.shape  # number of posts and number of words in the bag

# Compute the Euclidean distance
def dist_raw(v1, v2):
    delta = v1 - v2
    return sp.linalg.norm(delta.toarray())

# Test
new_para = '我要吃蘋果不吃香蕉'
new_para_list = " ".join(list(jieba.cut(new_para, cut_all=False)))
new_vec = vectorizer.transform([new_para_list])  # new_para_list is one sentence with words separated by spaces
print 'new_vec:', new_vec

for i in range(0, nums):
    para = summaryList[i]  # same order as the rows of X (paras is reversed because of insert(0, ...))
    para_vec = X.getrow(i)
    d = dist_raw(new_vec, para_vec)
    print para, " = ", d
Version 2:
#!/usr/bin/python
# -*- coding: utf-8 -*-
print 'test OK'
import sys
from nltk.tokenize import WordPunctTokenizer
import nltk
import jieba
from sklearn.feature_extraction.text import CountVectorizer
import scipy as sp

reload(sys)
sys.setdefaultencoding("utf-8")

tokenizer = WordPunctTokenizer()
summaryList = []
file = open("./para.txt")
paras = file.readlines()
words = ""
for para in paras:
    print para
    seg_list = list(jieba.cut(para, cut_all=False))
    words += " ".join(seg_list)
    summaryList.insert(0, " ".join(seg_list))
#para='I like eat apple because apple is red but because I love fruit'

# Count word frequencies
sentences = tokenizer.tokenize(words)  # splits words into a list of tokens
#print sentences
wordFreq = nltk.FreqDist(sentences)
print str(wordFreq.keys()).decode("unicode-escape")
#print dir(wordFreq)

print str(summaryList).decode("unicode-escape")

# Convert to a bag of words
vectorizer = CountVectorizer(min_df=0, max_df=20)
# summaryList is a list; each element is one sentence with words separated by spaces.
# By default, single-character tokens (a single Chinese character) are ignored;
# changing the vectorizer's token regex fixes that.
#vectorizer.token_pattern='(?u)\\b\\w+\\b'
X = vectorizer.fit_transform(summaryList)
print str(vectorizer.get_feature_names()).decode("unicode-escape")
print X.shape
nums, features = X.shape  # number of posts and number of words in the bag

# Compute the Euclidean distance
def dist_raw(v1, v2):
    delta = v1 - v2
    return sp.linalg.norm(delta.toarray())

# Test
new_para = '夏季新款清新碎花雪紡連衣裙,收腰顯瘦設計;小V領、小碎花、荷葉袖、荷葉邊的結合使得這款連衣裙更顯精致,清新且顯氣質。'
new_para_list = " ".join(list(jieba.cut(new_para, cut_all=False)))
new_vec = vectorizer.transform([new_para_list])  # new_para_list is one sentence with words separated by spaces
#print 'new_vec:', new_vec.toarray()

minDis = 9999
title = ""
for i in range(0, nums):
    para = summaryList[i]
    para_vec = X.getrow(i)
    d = dist_raw(new_vec, para_vec)
    #print X.getrow(i).toarray(), ' = ', d
    if minDis > d:
        minDis = d
        title = para
        print title, " = ", d
print new_para_list
print title
Output:
