Python jieba 分詞小說與詞頻統計


1、知識點

"""
1)cut()
    a) codecs.open() 解決編碼問題
    b) f.readline() 每次讀取一行,也可以使用 f.readlines() 一次讀取所有行(返回list列表)
    c) words =" ".join(jieba.cut(line))分詞,每個詞用空格分隔
2)lcut()
    返回一個list列表
"""

2、標點符號處理,並分詞,存儲到文件中

def fenCi():
    """
    Segment the novel line by line with jieba and store the result in a file.

    Reads ``深淵主宰系統.txt`` (UTF-8), joins the jieba tokens of each line with
    single spaces, removes literal ``...`` sequences and writes every kept
    line to ``seg.txt`` (UTF-8, truncated on each call).  A line is skipped
    when its segmented text starts with ``-`` or ``.`` or is shorter than
    10 characters.  The length of every segmented line is printed.

    :return: None
    """
    # Context managers close both files even if segmentation raises; the
    # previous version never closed either handle.
    with codecs.open("深淵主宰系統.txt", 'r', encoding='utf-8') as src, \
            open("seg.txt", 'w', encoding='utf-8') as dst:
        for line in src:
            words = " ".join(jieba.cut(line.strip(' ')))
            # The former chain of replace("", "") calls swapped the empty
            # string for itself and changed nothing; only "..." is removed.
            words = words.replace("...", "").strip(' ')
            print(len(words))
            # A bare line break ('\r\n') is shorter than 10 characters, so
            # the length check also covers the old explicit comparison.
            if words.startswith(('-', '.')) or len(words) < 10:
                continue
            # No separator is written, so kept lines end up concatenated.
            dst.write(words.strip('\n'))

3、中文分詞統計

def zhongwen():
    """
    Count word frequencies in the Chinese novel and print the top 15.

    Reads ``深淵主宰系統.txt`` (UTF-8), segments the whole text with
    ``jieba.lcut`` (which returns a list of tokens), removes literal ``...``
    sequences plus surrounding spaces and line breaks from each token, and
    counts only tokens that are at least two characters long.  At most 15
    entries are printed, most frequent first; fewer are printed when the
    text has fewer distinct words.

    :return: None
    """
    from collections import Counter

    # Close the file as soon as its content is in memory; the previous
    # version leaked the handle.
    with codecs.open("深淵主宰系統.txt", 'r', encoding='utf-8') as f:
        text = f.read()

    counts = Counter()
    for word in jieba.lcut(text):
        # The former chain of replace("", "") calls swapped the empty string
        # for itself and changed nothing; only "..." is removed.
        word = word.replace("...", "").strip(' ').strip('\r\n')
        # Skip empty and single-character tokens.
        if len(word) > 1:
            counts[word] += 1

    # most_common() returns at most 15 pairs, so a short text no longer
    # raises IndexError the way indexing items[0..14] did.
    for word, counter in counts.most_common(15):
        print("單詞:{},次數:{}".format(word, counter))

4、英文分詞統計

def get_txt(path="1.txt"):
    """
    Read a UTF-8 text file and normalise it for word counting.

    The text is lower-cased and every character listed in the punctuation
    set below is replaced by a single space.

    :param path: file to read; defaults to ``1.txt`` as before
    :return: the normalised text as a ``str``
    """
    # The with-block closes the file; the previous version leaked the handle.
    with open(path, "r", encoding='UTF-8') as f:
        txt = f.read().lower()
    # One translate() pass replaces the per-character replace() loop.
    to_space = str.maketrans(
        {ch: " " for ch in '!"#$%&()*+,-./:;<=>?@[\\]^_‘{|}~'})
    return txt.translate(to_space)

def yingwen():
    """
    Count English word frequencies and print the five most common words.

    Takes the normalised text from ``get_txt()``, splits it on whitespace,
    ignores single-character words and prints at most five ``word->count``
    lines, most frequent first.  Fewer lines are printed when the text has
    fewer than five distinct words.

    :return: None
    """
    from collections import Counter

    # split() never yields empty strings, so len > 1 drops exactly the
    # single-character words.
    counts = Counter(word for word in get_txt().split() if len(word) > 1)

    # most_common() returns at most 5 pairs, so a short text no longer
    # raises IndexError the way indexing items[0..4] did.
    for word, count in counts.most_common(5):
        print("{0:<5}->{1:>5}".format(word, count))

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯繫本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM