[Python] jieba 切詞：添加字典、去除停用詞與單字 python 2020.2.10

本文轉載自查看原文 2020-02-10 01:14 2205 Python 3/ 生活日志

源碼如下：

 1 import jieba
 2 import io
 3 import re
 4 
 5 #jieba.load_userdict("E:/xinxi2.txt")
 6 patton=re.compile(r'..')
 7 
 8 #添加字典
 9 def add_dict():
10     f=open("E:/xinxi2.txt","r+",encoding="utf-8")  #百度爬取的字典
11     for line in f:
12         jieba.suggest_freq(line.rstrip("\n"), True)
13     f.close()
14 
15 #對句子進行分詞
16 def cut():
17     number=0
18     f=open("E:/luntan.txt","r+",encoding="utf-8")   #要處理的內容，所爬信息，CSDN論壇標題
19     for line in f:
20         line=seg_sentence(line.rstrip("\n"))
21         seg_list=jieba.cut(line)
22         for i in seg_list:
23             print(i) #打印詞匯內容
24             m=patton.findall(i)
25             #print(len(m)) #打印字符長度
26             if len(m)!=0:
27                 write(i.strip()+" ")
28         line=line.rstrip().lstrip()
29         print(len(line))#打印句子長度
30         if len(line)>1:
31             write("\n")
32         number+=1
33         print("已處理",number,"行")
34 
35 #分詞后寫入
36 def write(contents):
37     f=open("E://luntan_cut2.txt","a+",encoding="utf-8") #要寫入的文件
38     f.write(contents)
39     #print("寫入成功！")
40     f.close()
41 
42 #創建停用詞
43 def stopwordslist(filepath):
44     stopwords = [line.strip() for line in open(filepath, 'r', encoding='utf-8').readlines()]
45     return stopwords
46 
47 # 對句子進行去除停用詞
48 def seg_sentence(sentence):
49     sentence_seged = jieba.cut(sentence.strip())
50     stopwords = stopwordslist('E://stop.txt')  # 這里加載停用詞的路徑
51     outstr = ''
52     for word in sentence_seged:
53         if word not in stopwords:
54             if word != '\t':
55                 outstr += word
56                 #outstr += " "
57     return outstr
58 
59 #循環去除、無用函數
60 def cut_all():
61     inputs = open('E://luntan_cut.txt', 'r', encoding='utf-8')
62     outputs = open('E//luntan_stop.txt', 'a')
63     for line in inputs:
64         line_seg = seg_sentence(line)  # 這里的返回值是字符串
65         outputs.write(line_seg + '\n')
66     outputs.close()
67     inputs.close()
68 
69 if __name__=="__main__":
70     add_dict()
71     cut()

luntan.txt的來源，地址：https://www.cnblogs.com/zlc364624/p/12285055.html

其中停用詞可自行百度下載，或者自己創建一個txt文件，自行添加詞匯，每行一個詞（用換行符隔開）。

百度爬取的字典在前幾期博客中可以找到，地址：https://www.cnblogs.com/zlc364624/p/12289008.html

效果如下：

import jieba
import io
import re

#jieba.load_userdict("E:/xinxi2.txt")
# Matches any two consecutive non-newline characters; cut() keeps only tokens
# on which this pattern finds at least one match.
patton=re.compile(r'..')

# Load the custom dictionary.
def add_dict():
    """Register every entry of the custom dictionary file with jieba.

    Reads ``E:/xinxi2.txt`` (UTF-8, one entry per line) and calls
    ``jieba.suggest_freq(entry, True)`` for each non-empty entry.
    """
    # Read-only mode is sufficient, and the ``with`` block closes the file
    # even if jieba raises part-way through.
    with open("E:/xinxi2.txt", "r", encoding="utf-8") as dict_file:
        for line in dict_file:
            entry = line.rstrip("\n")
            if entry:  # skip blank lines instead of registering an empty word
                jieba.suggest_freq(entry, True)

# Segment the sentences of the input file.
def cut():
    """Segment every line of the input file and append the result to the output.

    For each line of ``E:/luntan.txt``: stop words are removed with
    ``seg_sentence``, the remainder is segmented with ``jieba.cut``, and every
    token matched by ``patton`` is written followed by one space. A newline is
    written after each line whose filtered text is longer than one character.
    Each token, the filtered line length and a progress counter are printed.
    """
    # Read-only mode is sufficient; ``with`` closes the file on any exit path.
    with open("E:/luntan.txt", "r", encoding="utf-8") as source:
        for number, raw_line in enumerate(source, start=1):
            line = seg_sentence(raw_line.rstrip("\n"))
            pieces = []
            for token in jieba.cut(line):
                print(token)  # echo every token
                if patton.search(token):
                    pieces.append(token.strip() + " ")
            line = line.strip()
            print(len(line))  # length of the filtered sentence
            if len(line) > 1:
                pieces.append("\n")
            # One write per input line instead of one open/close per token.
            if pieces:
                write("".join(pieces))
            print("已處理", number, "行")

# Persist segmented text.
def write(contents):
    """Append ``contents`` to the segmented-output file ``E://luntan_cut2.txt``."""
    # ``with`` guarantees the handle is closed even if the write fails.
    with open("E://luntan_cut2.txt", "a", encoding="utf-8") as out_file:
        out_file.write(contents)

# Build the stop-word list.
def stopwordslist(filepath):
    """Return the stop words stored in ``filepath`` as a list.

    The file is read as UTF-8 and every line is stripped of surrounding
    whitespace. Blank lines yield empty strings in the result.
    """
    # The context manager closes the file; the bare ``open(...)`` inside a
    # comprehension left that to the garbage collector.
    with open(filepath, 'r', encoding='utf-8') as stop_file:
        return [line.strip() for line in stop_file]

# Remove stop words from a sentence.

# Stop-word sets already loaded, keyed by file path, so the stop-word file is
# parsed once instead of once per sentence.
_STOPWORD_CACHE = {}


def _stopword_set(filepath):
    """Return the stop words in ``filepath`` as a cached frozenset."""
    if filepath not in _STOPWORD_CACHE:
        _STOPWORD_CACHE[filepath] = frozenset(stopwordslist(filepath))
    return _STOPWORD_CACHE[filepath]


def seg_sentence(sentence):
    """Return ``sentence`` with stop words and tab tokens removed.

    The stripped sentence is segmented with ``jieba.cut``. Tokens listed in
    ``E://stop.txt`` and tokens equal to a tab character are dropped; the
    remaining tokens are concatenated without a separator.
    """
    stopwords = _stopword_set('E://stop.txt')  # path of the stop-word file
    return ''.join(
        word
        for word in jieba.cut(sentence.strip())
        if word not in stopwords and word != '\t'
    )

# Batch stop-word removal (marked as unused by the original author).
def cut_all():
    """Remove stop words from every line of ``E://luntan_cut.txt``.

    Each input line is passed through ``seg_sentence`` (which returns a
    string) and the result is appended, followed by a newline, to
    ``E://luntan_stop.txt``.
    """
    # The output path previously lacked the drive colon ('E//...'), which
    # pointed at a relative directory named "E". The output is also written
    # as UTF-8 explicitly rather than in the platform default encoding.
    with open('E://luntan_cut.txt', 'r', encoding='utf-8') as inputs, \
            open('E://luntan_stop.txt', 'a', encoding='utf-8') as outputs:
        for line in inputs:
            outputs.write(seg_sentence(line) + '\n')

if __name__ == "__main__":
    # Run the two stages in order: dictionary registration, then segmentation.
    for stage in (add_dict, cut):
        stage()

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯系本站郵箱yoyou2525@163.com刪除。

猜您在找 python jieba分詞（添加停用詞，用戶字典取詞頻 jieba文本分詞，去除停用詞，添加用戶詞 python去除停用詞（結巴分詞下） python利用jieba進行中文分詞去停用詞 python使用jieba實現中文文檔分詞和去停用詞 python 去停用詞 python調用jieba(結巴)分詞加入自定義詞典和去停用詞功能 [超詳細] Python3爬取豆瓣影評、去停用詞、詞雲圖、評論關鍵詞繪圖處理 NLTK 停用詞、罕見詞 [Python]pyhon去除txt文件重復行 python 2020.2.10

[Python]jieba切詞 添加字典 去除停用詞、單字 python 2020.2.10

免責聲明！

[Python]jieba切詞添加字典去除停用詞、單字 python 2020.2.10