python 進行結巴分詞 並且用re去掉符號


# 把停用詞做成字典
stopwords = {}
fstop = open('stop_words.txt', 'r',encoding='utf-8',errors='ingnore')
for eachWord in fstop:
stopwords[eachWord.strip()] = eachWord.strip() #停用詞典
fstop.close()

f1=open('all.txt','r',encoding='utf-8',errors='ignore')
f2=open('allutf11.txt','w',encoding='utf-8')

line=f1.readline()
while line:
line = line.strip() #去前后的空格
line = re.sub(r"[0-9\s+\.\!\/_,$%^*()?;;:-【】+\"\']+|[+——!,;:。?、~@#¥%……&*()]+", " ", line) #去標點符號
seg_list=jieba.cut(line,cut_all=False) #結巴分詞
outStr=""
for word in seg_list:
if word not in stopwords:
outStr+=word
outStr+=" "
f2.write(outStr)
    line=f1.readline()
f1.close()
f2.close()





免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM