python jieba分詞小說與詞頻統計

本文轉載自查看原文 2019-06-11 15:29 2162 python學習之路

1、知識點

"""
1)cut()
    a) codecs.open() 以指定編碼（如 utf-8）打開文件，解決編碼問題
    b) f.readline() 每次讀取一行；f.readlines() 則一次讀取所有行並返回list
    c) words = " ".join(jieba.cut(line)) 分詞：cut() 返回生成器，這裏用空格把各個詞連接起來
2)lcut()
    用法與cut()相同，但直接返回一個list列表（而不是生成器）
"""

2、標點符號處理，並分詞,存儲到文件中

def fenCi():
    """
    Strip punctuation from the novel, segment it with jieba and save the result.

    Reads "深淵主宰系統.txt" (UTF-8) line by line, joins the jieba tokens of
    each line with single spaces, removes a fixed set of punctuation marks
    and writes the surviving lines to "seg.txt" (UTF-8, overwritten).
    A cleaned line is skipped when it starts with '-' or '.', or when it is
    shorter than 10 characters. The length of every cleaned line is printed.

    :return: None
    """
    # Removed in this exact order, one after another.
    punctuation = ("，", "！", "“", "”", "。", "？", "：", "...", "、")

    # Context managers guarantee both files are closed (and seg.txt flushed)
    # even if segmentation raises part-way through.
    with codecs.open("深淵主宰系統.txt", 'r', encoding='utf-8') as f, \
            open("seg.txt", 'w', encoding='utf-8') as f1:
        for line in f:
            words = " ".join(jieba.cut(line.strip(' ')))
            for mark in punctuation:
                words = words.replace(mark, "")
            words = words.strip(' ')
            print(len(words))
            # The length check also covers blank lines such as '\r\n'.
            if words.startswith(('-', '.')) or len(words) < 10:
                continue
            # Newlines at either end are stripped and none is added, so kept
            # lines are written back-to-back.
            f1.write(words.strip('\n'))

3、中文分詞統計

def zhongwen():
    """
    Count Chinese word frequencies in the novel and print the most common.

    Reads the whole of "深淵主宰系統.txt" (UTF-8), segments it with
    jieba.lcut (which returns a list of tokens), removes a fixed set of
    punctuation marks from every token and counts the tokens that are still
    at least two characters long. Prints up to 15 words, most frequent first.

    :return: None
    """
    # Close the file as soon as its content has been read.
    with codecs.open("深淵主宰系統.txt", 'r', encoding='utf-8') as f:
        text = f.read()

    # Removed in this exact order, one after another.
    punctuation = ("，", "！", "“", "”", "。", "？", "：", "...", "、")

    counts = {}
    for word in jieba.lcut(text):
        for mark in punctuation:
            word = word.replace(mark, "")
        word = word.strip(' ').strip('\r\n')
        # Skip empty tokens and single characters.
        if len(word) <= 1:
            continue
        counts[word] = counts.get(word, 0) + 1

    # Stable descending sort by count; ties keep first-seen order.
    items = sorted(counts.items(), key=lambda x: x[1], reverse=True)

    # Slicing avoids an IndexError when fewer than 15 distinct words exist.
    for word, counter in items[:15]:
        print("單詞：{},次數：{}".format(word, counter))

4、英文分詞統計

def get_txt():
    """
    Load "1.txt" and normalise it for word counting.

    The file is read as UTF-8, lower-cased, and every character from a fixed
    set of special characters is replaced by a single space.

    :return: the normalised text as one string
    """
    # The context manager closes the file once its content is read.
    with open("1.txt", "r", encoding='UTF-8') as f:
        txt = f.read().lower()

    specials = '!"#$%&()*+,-./:;<=>?@[\\]^_‘{|}~'
    # One translate pass maps each special character to a space.
    return txt.translate(str.maketrans(specials, " " * len(specials)))

def yingwen():
    """
    Count English word frequencies and print the most common words.

    Takes the text returned by get_txt(), splits it on whitespace, ignores
    one-character words and counts the rest. Prints up to 5 words, most
    frequent first, each as "word->count" with the word left-aligned and the
    count right-aligned in 5-character columns.

    :return: None
    """
    file_txt = get_txt()

    counts = {}
    for word in file_txt.split():
        # Skip one-character words.
        if len(word) == 1:
            continue
        counts[word] = counts.get(word, 0) + 1

    # Stable descending sort by count; ties keep first-seen order.
    items = sorted(counts.items(), key=lambda x: x[1], reverse=True)

    # Slicing avoids an IndexError when fewer than 5 distinct words exist.
    for word, count in items[:5]:
        print("{0:<5}->{1:>5}".format(word, count))

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯系本站郵箱yoyou2525@163.com刪除。

猜您在找 利用jieba分詞進行詞頻統計 jieba庫詞頻統計分詞————jieba分詞（Python）文本數據分詞，詞頻統計，可視化 - Python python jieba 庫分詞結合Wordcloud詞雲統計 jieba 分詞庫（python） python jieba分詞詞性 python 分詞庫jieba python結巴(jieba)分詞運用jieba庫進行詞頻統計