1、知識點
"""
1)cut()
   a) codecs.open() 解決編碼問題
   b) f.readline() 讀取一行,也可以使用 f.readlines() 一次讀取所有行(返回list列表)
   c) words = " ".join(jieba.cut(line)) 分詞,每個詞用空格分隔
2)lcut() 分詞並直接返回一個list列表
"""
2、標點符號處理,並分詞,存儲到文件中
def fenCi():
    """Segment the source text line by line and write the result to ``seg.txt``.

    Each line of ``深淵主宰系統.txt`` (read as UTF-8) is tokenised with
    ``jieba.cut``; the tokens are joined with single spaces and a fixed set
    of punctuation marks is removed. The cleaned text is written to
    ``seg.txt`` with newline characters stripped from both ends.

    A line is skipped when its cleaned text starts with ``-`` or ``.``,
    is exactly a CR+LF pair, or is shorter than 10 characters. The length
    of every cleaned line is printed to stdout, including skipped ones.

    :return: None
    """
    # Removed in this exact order; "..." is a three-character token.
    punctuation = (",", "!", "“", "”", "。", "?", ":", "...", "、")

    # Context managers guarantee both files are closed (and the output is
    # flushed) even if segmentation raises part-way through.
    with codecs.open("深淵主宰系統.txt", 'r', encoding='utf-8') as src, \
            open("seg.txt", 'w', encoding='utf-8') as dst:
        for line in src:
            words = " ".join(jieba.cut(line.strip(' ')))
            for mark in punctuation:
                words = words.replace(mark, "")
            words = words.strip(' ')
            print(len(words))

            if (words.startswith(('-', '.'))
                    or words == '\r\n'
                    or len(words) < 10):
                continue

            # No separator is added between lines: the output is one
            # continuous stream of the cleaned text.
            dst.write(words.strip('\n'))
3、中文分詞統計
def zhongwen():
    """Print the most frequent multi-character words in the source text.

    Reads all of ``深淵主宰系統.txt`` (UTF-8), tokenises it with
    ``jieba.lcut`` (which returns a list of tokens), removes a fixed set of
    punctuation marks from every token, and counts the tokens that are
    still longer than one character. The top 15 entries by count are
    printed in descending order; fewer are printed when the text contains
    fewer than 15 distinct qualifying words.

    :return: None
    """
    # Removed in this exact order; "..." is a three-character token.
    punctuation = (",", "!", "“", "”", "。", "?", ":", "...", "、")

    # Close the file as soon as its contents are in memory.
    with codecs.open("深淵主宰系統.txt", 'r', encoding='utf-8') as src:
        text = src.read()

    counts = {}
    for word in jieba.lcut(text):
        for mark in punctuation:
            word = word.replace(mark, "")
        word = word.strip(' ').strip('\r\n')
        # Skip empty tokens and single characters.
        if len(word) <= 1:
            continue
        counts[word] = counts.get(word, 0) + 1

    # Stable sort: words with equal counts keep first-seen order.
    ranked = sorted(counts.items(), key=lambda x: x[1], reverse=True)

    # Slicing avoids an IndexError when fewer than 15 words were counted.
    for word, counter in ranked[:15]:
        print("單詞:{},次數:{}".format(word, counter))
4、英文分詞統計
def get_txt():
    """Return the lower-cased contents of ``1.txt`` with punctuation blanked.

    The file is read as UTF-8 from the current working directory. Every
    character in the punctuation list below is replaced by a single space,
    so the result can be split on whitespace.

    :return: the normalised text as a ``str``
    """
    # The context manager closes the file handle deterministically.
    with open("1.txt", "r", encoding='UTF-8') as src:
        txt = src.read().lower()
    for ch in '!"#$%&()*+,-./:;<=>?@[\\]^_‘{|}~':
        txt = txt.replace(ch, " ")  # replace special characters with spaces
    return txt


def yingwen():
    """Print the most frequent English words found in ``1.txt``.

    The text from ``get_txt()`` is split on whitespace and words longer
    than one character are counted. The top 5 words by count are printed
    as ``word->count``, with the word left-aligned and the count
    right-aligned in width-5 columns. Fewer lines are printed when the
    text has fewer than 5 distinct qualifying words.

    :return: None
    """
    counts = {}
    for word in get_txt().split():
        # Ignore single-character words.
        if len(word) == 1:
            continue
        counts[word] = counts.get(word, 0) + 1

    # Stable sort: words with equal counts keep first-seen order.
    ranked = sorted(counts.items(), key=lambda x: x[1], reverse=True)

    # Slicing avoids an IndexError when fewer than 5 words were counted.
    for word, count in ranked[:5]:
        print("{0:<5}->{1:>5}".format(word, count))