源碼如下:
1 import jieba 2 import io 3 import re 4 5 #jieba.load_userdict("E:/xinxi2.txt") 6 patton=re.compile(r'..') 7 8 #添加字典 9 def add_dict(): 10 f=open("E:/xinxi2.txt","r+",encoding="utf-8") #百度爬取的字典 11 for line in f: 12 jieba.suggest_freq(line.rstrip("\n"), True) 13 f.close() 14 15 #對句子進行分詞 16 def cut(): 17 number=0 18 f=open("E:/luntan.txt","r+",encoding="utf-8") #要處理的內容,所爬信息,CSDN論壇標題 19 for line in f: 20 line=seg_sentence(line.rstrip("\n")) 21 seg_list=jieba.cut(line) 22 for i in seg_list: 23 print(i) #打印詞匯內容 24 m=patton.findall(i) 25 #print(len(m)) #打印字符長度 26 if len(m)!=0: 27 write(i.strip()+" ") 28 line=line.rstrip().lstrip() 29 print(len(line))#打印句子長度 30 if len(line)>1: 31 write("\n") 32 number+=1 33 print("已處理",number,"行") 34 35 #分詞后寫入 36 def write(contents): 37 f=open("E://luntan_cut2.txt","a+",encoding="utf-8") #要寫入的文件 38 f.write(contents) 39 #print("寫入成功!") 40 f.close() 41 42 #創建停用詞 43 def stopwordslist(filepath): 44 stopwords = [line.strip() for line in open(filepath, 'r', encoding='utf-8').readlines()] 45 return stopwords 46 47 # 對句子進行去除停用詞 48 def seg_sentence(sentence): 49 sentence_seged = jieba.cut(sentence.strip()) 50 stopwords = stopwordslist('E://stop.txt') # 這里加載停用詞的路徑 51 outstr = '' 52 for word in sentence_seged: 53 if word not in stopwords: 54 if word != '\t': 55 outstr += word 56 #outstr += " " 57 return outstr 58 59 #循環去除、無用函數 60 def cut_all(): 61 inputs = open('E://luntan_cut.txt', 'r', encoding='utf-8') 62 outputs = open('E//luntan_stop.txt', 'a') 63 for line in inputs: 64 line_seg = seg_sentence(line) # 這里的返回值是字符串 65 outputs.write(line_seg + '\n') 66 outputs.close() 67 inputs.close() 68 69 if __name__=="__main__": 70 add_dict() 71 cut()
luntan.txt的來源,地址:https://www.cnblogs.com/zlc364624/p/12285055.html
其中停用詞可自行百度下載,或者自己創建一個txt文件夾,自行添加詞匯用換行符隔開。
百度爬取的字典在前幾期博客中可以找到,地址:https://www.cnblogs.com/zlc364624/p/12289008.html
效果如下:
import jieba
import io
import re
#jieba.load_userdict("E:/xinxi2.txt")
patton=re.compile(r'..')
#添加字典
def add_dict():
f=open("E:/xinxi2.txt","r+",encoding="utf-8") #百度爬取的字典
for line in f:
jieba.suggest_freq(line.rstrip("\n"), True)
f.close()
#對句子進行分詞
def cut():
number=0
f=open("E:/luntan.txt","r+",encoding="utf-8") #要處理的內容,所爬信息,CSDN論壇標題
for line in f:
line=seg_sentence(line.rstrip("\n"))
seg_list=jieba.cut(line)
for i in seg_list:
print(i) #打印詞匯內容
m=patton.findall(i)
#print(len(m)) #打印字符長度
if len(m)!=0:
write(i.strip()+" ")
line=line.rstrip().lstrip()
print(len(line))#打印句子長度
if len(line)>1:
write("\n")
number+=1
print("已處理",number,"行")
#分詞后寫入
def write(contents):
f=open("E://luntan_cut2.txt","a+",encoding="utf-8") #要寫入的文件
f.write(contents)
#print("寫入成功!")
f.close()
#創建停用詞
def stopwordslist(filepath):
stopwords = [line.strip() for line in open(filepath, 'r', encoding='utf-8').readlines()]
return stopwords
# 對句子進行去除停用詞
def seg_sentence(sentence):
sentence_seged = jieba.cut(sentence.strip())
stopwords = stopwordslist('E://stop.txt') # 這里加載停用詞的路徑
outstr = ''
for word in sentence_seged:
if word not in stopwords:
if word != '\t':
outstr += word
#outstr += " "
return outstr
#循環去除、無用函數
def cut_all():
inputs = open('E://luntan_cut.txt', 'r', encoding='utf-8')
outputs = open('E//luntan_stop.txt', 'a')
for line in inputs:
line_seg = seg_sentence(line) # 這里的返回值是字符串
outputs.write(line_seg + '\n')
outputs.close()
inputs.close()
if __name__=="__main__":
add_dict()
cut()