比較簡單的功能,需求只到了這裡,所以也就沒有繼續下去了。
# -*- encoding: utf-8 -*-
# by sorcerdu
# Basic features and usage are described in the hints printed at start-up.
# How it works: the text is split into a list of words on separators, then the
# list is folded into a dict whose keys are words and whose values are counts.
# Chinese text has to be segmented into words before it can be counted.
import os
import string
import codecs
import sys
import time


def _clean_word(word):
    """Return *word* with the separators that are handled by default removed."""
    # A period that ends a line must be removed before the newline itself;
    # once '\n' is gone the '.\n' pattern can never match.
    word = word.replace('.\n', '')
    # NOTE(review): tabs are deleted rather than treated as separators, so
    # 'a\tb' becomes 'ab'. Kept from the original; confirm this is intended.
    for separator in ('\t', '\n', ' '):
        word = word.replace(separator, '')
    return word


def _read_tags(tag_path):
    """Return every non-empty, space-separated tag found in *tag_path*."""
    tags = []
    with open(tag_path, 'r') as tag_file:
        for line in tag_file:
            # Drop the line terminator so the last tag on a line stays usable,
            # and skip the empty strings produced by consecutive spaces
            # (str.split('') would raise ValueError later on).
            tags.extend(tag for tag in line.rstrip('\r\n').split(' ') if tag)
    return tags


def _split_on_tags(words, tags):
    """Split every word on each tag in turn, discarding empty fragments."""
    for tag in tags:
        # Build a fresh list per tag instead of deleting from the list that is
        # being indexed, which skipped entries and could raise IndexError.
        pieces = []
        for word in words:
            pieces.extend(part for part in word.split(tag) if part)
        words = pieces
    return words


def readfile(base_path='base.txt', tag_path='tag.txt'):
    """Read the article and return its words as a list.

    Each line of the article is split on single spaces; tabs, newlines and a
    period that ends a line are removed from every piece, and the remaining
    pieces are split again on each separator listed in the tag file.

    Args:
        base_path: path of the text to analyse (defaults to 'base.txt').
        tag_path: path of the file holding the extra separators, separated
            by spaces; separators on every line of the file are used
            (defaults to 'tag.txt').

    Returns:
        A list of non-empty words in the order they appear in the article.

    Raises:
        IOError / OSError: if either file cannot be opened.
    """
    tags = _read_tags(tag_path)

    words = []
    with open(base_path, 'r') as base_file:
        for line in base_file:
            for raw_word in line.split(' '):
                word = _clean_word(raw_word)
                if word:
                    words.append(word)

    ## tags=['.','"',',','!','?','(',')']
    return _split_on_tags(words, tags)


def getstr(word, count, allwordnum):
    """Format one result line: word, its count and the total number of words."""
    return '--------'.join([word, str(count), str(allwordnum)])


def main():
    """Count the words of base.txt, print the result and save it to count.txt."""
    wordlist = readfile()
    allwordnum = len(wordlist)

    print('******************************************')
    print(u'提示:')
    print(u' 1、要統計的文章放置於本程序路徑下的base.txt中')
    print(u' 2、單詞分割符存放在本程序路徑下的tag.txt中,以空格為分隔符,默認已對換碼符,換行符,空格,句號(英文)處理')
    print(u' 3、統計的結果保存在本程序路徑下的count.txt中')
    print('******************************************')
    print(u"開始統計咯......")

    print('------------------------------------------------------------------------')
    wordcnt = {}
    for word in wordlist:
        wordcnt[word] = wordcnt.get(word, 0) + 1

    # The context manager closes count.txt even if printing or writing fails.
    with open('count.txt', 'w') as outdata:
        for word, cnt in wordcnt.items():
            line = getstr(word, cnt, allwordnum)
            print(line)
            outdata.write(line + '\n')

    print('------------------------------------------------------------------------')
    print(u"完成")
    print(u'按任意鍵退出')
    # "pause" is a Windows shell built-in; it keeps the console window open.
    os.system("pause")


if __name__ == "__main__":
    main()