文本词频统计 -- Hamlet
Hamlet下载
链接:https://pan.baidu.com/s/1Is2mBAED57i6nI38lcnXAA 提取码:zqw1
from collections import Counter
from operator import itemgetter

# Characters that getText() turns into spaces before the text is split into
# words. The set is kept exactly as the original script had it; note that
# "#", "~" and the apostrophe are not in it, so they stay attached to words.
_PUNCTUATION = '!"$%&()*+,-./:;<=>?@[\\]^_{}|·`'

# One translation table lets str.translate blank every character in a single
# pass instead of one str.replace call per character.
_PUNCTUATION_TO_SPACE = str.maketrans(dict.fromkeys(_PUNCTUATION, " "))


def getText(path="hamlet.txt", encoding=None):
    """Read a text file and normalise it for word counting.

    The content is lower-cased and every character in ``_PUNCTUATION`` is
    replaced by a single space, so a plain ``str.split()`` yields the words.

    Args:
        path: File to read. Defaults to ``"hamlet.txt"``, the file the
            original script always opened.
        encoding: Passed straight to ``open``. ``None`` (the default) keeps
            the platform default encoding, as the original script did.

    Returns:
        The normalised text as one string.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # The context manager closes the handle even if read() fails.
    with open(path, "r", encoding=encoding) as source:
        txt = source.read()
    return txt.lower().translate(_PUNCTUATION_TO_SPACE)


def _count_words(text):
    """Return a Counter mapping each whitespace-separated word to its count."""
    return Counter(text.split())


def _most_frequent(items, limit):
    """Return up to ``limit`` (word, count) pairs, highest count first.

    The sort is stable, so words with equal counts keep the order they have
    in ``items``. Slicing instead of indexing means a text with fewer than
    ``limit`` distinct words simply yields a shorter list.
    """
    ranked = sorted(items, key=itemgetter(1), reverse=True)
    return ranked[:max(limit, 0)]


def main(path="hamlet.txt", top_n=10):
    """Print all word counts, then a table of the ``top_n`` commonest words.

    Args:
        path: Text file to analyse; forwarded to ``getText``.
        top_n: Maximum number of rows in the final table.
    """
    counts = _count_words(getText(path))
    items = list(counts.items())
    # Full dump of every (word, count) pair, e.g. [('the', 1138), ('tragedy', 3)]
    print(items)
    for word, count in _most_frequent(items, top_n):
        print("{0:<6}{1:>9}".format(word, count))


if __name__ == "__main__":
    main()