Using the jieba library for word segmentation
Installing jieba is not covered here; look it up yourself if needed.
import jieba
Segment a title into words and convert the result to a list:
seg_list = list(jieba.cut(result.get("title"), cut_all=False))
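For reference, here is a minimal standalone sketch (the sample sentence is my own, not taken from the post data) showing how jieba's precise mode (cut_all=False) differs from full mode (cut_all=True):

# -*- coding: utf-8 -*-
import jieba

sentence = '我来到北京清华大学'  # hypothetical sample sentence
# precise mode: each character belongs to exactly one word, e.g. 我/来到/北京/清华大学
print "/".join(jieba.cut(sentence, cut_all=False))
# full mode: lists every word jieba can find, e.g. 我/来到/北京/清华/清华大学/华大/大学
print "/".join(jieba.cut(sentence, cut_all=True))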
Join all the segmented titles with spaces into one string, which makes the later natural-language processing easier:
para = para + " ".join(seg_list)
Also store each segmented title (the space-joined string) in a list:
summaryList.insert(0," ".join(seg_list))
Count word frequencies
from nltk.tokenize import WordPunctTokenizer
import nltk

tokenizer = WordPunctTokenizer()
# count word frequencies
sentences = tokenizer.tokenize(para)  # split the space-joined string para into a list of tokens
wordFreq = nltk.FreqDist(sentences)
for i in wordFreq:
    print i, wordFreq[i]
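If you only need the most frequent terms rather than the whole distribution, FreqDist can also return them sorted. A small sketch, assuming NLTK 3+ (where FreqDist is a Counter subclass that provides most_common); the variable name follows the code above:

# print only the 10 most frequent terms
for word, freq in wordFreq.most_common(10):
    print word, freq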
Convert to a bag of words. The input to this step is a list of sentences (words separated by spaces); the result is the list of terms that appear in those sentences, which is the bag of words.
# convert to a bag of words
vectorizer = CountVectorizer(min_df=1, max_df=50)
# summaryList is a list in which each element is a sentence with words separated by spaces.
# By default, single-character tokens (i.e. a single Chinese character) are ignored;
# this can be fixed by changing the vectorizer's token regular expression:
vectorizer.token_pattern = '(?u)\\b\\w+\\b'
X = vectorizer.fit_transform(summaryList)
print X.shape
nums, features = X.shape  # number of posts and number of terms in the bag; X.getrow(i) is the vector of post i
print vectorizer
print str(vectorizer.get_feature_names()).decode("unicode-escape")
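To see concretely why the token_pattern tweak matters: CountVectorizer's default pattern, (?u)\b\w\w+\b, only keeps tokens of two or more characters, so a single Chinese character produced by jieba is silently dropped. A minimal sketch with a made-up two-sentence corpus (the pattern can equally be passed to the constructor, as here, or set as an attribute before fit, as above):

# -*- coding: utf-8 -*-
from sklearn.feature_extraction.text import CountVectorizer

docs = [u'我 爱 吃 苹果', u'我 爱 吃 香蕉']  # hypothetical corpus, already space-separated
# default token_pattern keeps only tokens with >= 2 characters: just 苹果 and 香蕉 survive
print str(CountVectorizer().fit(docs).get_feature_names()).decode("unicode-escape")
# relaxed pattern keeps single characters too: 吃, 我, 爱, 苹果, 香蕉
print str(CountVectorizer(token_pattern='(?u)\\b\\w+\\b').fit(docs).get_feature_names()).decode("unicode-escape")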
A helper function that computes the Euclidean distance between two vectors:
# Euclidean distance
def dist_raw(v1, v2):
    delta = v1 - v2
    return sp.linalg.norm(delta.toarray())
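A possible refinement, not part of the original code: with raw counts, longer posts look far away simply because they contain more words. Normalizing each vector to unit length before subtracting compares word distributions instead. A sketch under that assumption:

import scipy as sp

def dist_norm(v1, v2):
    # scale both sparse vectors to unit length, then take the Euclidean distance,
    # so the result reflects the word distribution rather than the post length
    # (assumes neither vector is all zeros)
    v1_normalized = v1 / sp.linalg.norm(v1.toarray())
    v2_normalized = v2 / sp.linalg.norm(v2.toarray())
    delta = v1_normalized - v2_normalized
    return sp.linalg.norm(delta.toarray())

If desired, dist_norm can be dropped in wherever dist_raw is called below without any other change.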
Compute the vector for a new post:
# test with a new post
new_para = '我要吃苹果不吃香蕉'
new_para_list = " ".join(list(jieba.cut(new_para, cut_all=False)))
new_vec = vectorizer.transform([new_para_list])  # new_para_list is a single sentence with words separated by spaces
print 'new_vec:', new_vec
Compute the distance between the new post and each of the original posts:
for i in range(0, nums):
    para = summaryList[i]  # X.getrow(i) corresponds to summaryList[i], not paras[i], because summaryList was built with insert(0, ...)
    para_vec = X.getrow(i)
    d = dist_raw(new_vec, para_vec)
    print para, " = ", d
Full code:

#!/usr/bin/python
# -*- coding: utf-8 -*-
print 'test OK'
import sys
from nltk.tokenize import WordPunctTokenizer
import nltk
import jieba
from sklearn.feature_extraction.text import CountVectorizer
import scipy as sp

reload(sys)
sys.setdefaultencoding("utf-8")

tokenizer = WordPunctTokenizer()
summaryList = []
file = open("./para.txt")
paras = file.readlines()
words = ""
for para in paras:
    print para
    seg_list = list(jieba.cut(para, cut_all=False))
    words += " ".join(seg_list)
    summaryList.insert(0, " ".join(seg_list))

#para='I like eat apple because apple is red but because I love fruit'
# count word frequencies
sentences = tokenizer.tokenize(words)  # split the space-joined string into a list of tokens
#print sentences
wordFreq = nltk.FreqDist(sentences)
print str(wordFreq.keys()).decode("unicode-escape")
#print dir(wordFreq)
for i in wordFreq:
    print i, wordFreq[i]

print str(summaryList).decode("unicode-escape")
# convert to a bag of words
vectorizer = CountVectorizer(min_df=1, max_df=50)
# summaryList is a list in which each element is a sentence with words separated by spaces.
# By default, single-character tokens (single Chinese characters) are ignored;
# changing the token regular expression fixes that:
vectorizer.token_pattern = '(?u)\\b\\w+\\b'
X = vectorizer.fit_transform(summaryList)
print str(vectorizer.get_feature_names()).decode("unicode-escape")
print X.shape
nums, features = X.shape  # number of posts and number of terms in the bag

# Euclidean distance
def dist_raw(v1, v2):
    delta = v1 - v2
    return sp.linalg.norm(delta.toarray())

# test with a new post
new_para = '我要吃苹果不吃香蕉'
new_para_list = " ".join(list(jieba.cut(new_para, cut_all=False)))
new_vec = vectorizer.transform([new_para_list])  # a single sentence, words separated by spaces
print 'new_vec:', new_vec

for i in range(0, nums):
    para = summaryList[i]  # X.getrow(i) corresponds to summaryList[i], because summaryList was built with insert(0, ...)
    para_vec = X.getrow(i)
    d = dist_raw(new_vec, para_vec)
    print para, " = ", d
Version two:

#!/usr/bin/python
# -*- coding: utf-8 -*-
print 'test OK'
import sys
from nltk.tokenize import WordPunctTokenizer
import nltk
import jieba
from sklearn.feature_extraction.text import CountVectorizer
import scipy as sp

reload(sys)
sys.setdefaultencoding("utf-8")

tokenizer = WordPunctTokenizer()
summaryList = []
file = open("./para.txt")
paras = file.readlines()
words = ""
for para in paras:
    print para
    seg_list = list(jieba.cut(para, cut_all=False))
    words += " ".join(seg_list)
    summaryList.insert(0, " ".join(seg_list))

#para='I like eat apple because apple is red but because I love fruit'
# count word frequencies
sentences = tokenizer.tokenize(words)  # split the space-joined string into a list of tokens
#print sentences
wordFreq = nltk.FreqDist(sentences)
print str(wordFreq.keys()).decode("unicode-escape")
#print dir(wordFreq)

print str(summaryList).decode("unicode-escape")
# convert to a bag of words
vectorizer = CountVectorizer(min_df=0, max_df=20)
# summaryList is a list in which each element is a sentence with words separated by spaces.
# By default, single-character tokens (single Chinese characters) are ignored;
# changing the token regular expression fixes that:
#vectorizer.token_pattern = '(?u)\\b\\w+\\b'
X = vectorizer.fit_transform(summaryList)
print str(vectorizer.get_feature_names()).decode("unicode-escape")
print X.shape
nums, features = X.shape  # number of posts and number of terms in the bag

# Euclidean distance
def dist_raw(v1, v2):
    delta = v1 - v2
    return sp.linalg.norm(delta.toarray())

# test with a new post
new_para = '夏季新款清新碎花雪纺连衣裙,收腰显瘦设计;小V领、小碎花、荷叶袖、荷叶边的结合使得这款连衣裙更显精致,清新且显气质。'
new_para_list = " ".join(list(jieba.cut(new_para, cut_all=False)))
new_vec = vectorizer.transform([new_para_list])  # a single sentence, words separated by spaces
#print 'new_vec:', new_vec.toarray()

minDis = 9999
title = ""
for i in range(0, nums):
    para = summaryList[i]
    para_vec = X.getrow(i)
    d = dist_raw(new_vec, para_vec)
    #print X.getrow(i).toarray(), ' = ', d
    if minDis > d:
        minDis = d
        title = para
        print title, " = ", d
print new_para_list
print title
Output: