Practical notes series:
3 Natural Language Processing Notes
Corpus pre-processing wrapper:
#coding=utf-8
import os
import jieba
import sys
import re
import time
import jieba.posseg as pseg

sys.path.append("../")
jieba.load_userdict("../Database/userdict.txt")   # load the custom segmentation dictionary
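# For reference, jieba's custom dictionary (userdict.txt above) is a plain-text
# file with one entry per line: the word, optionally followed by a frequency and
# a POS tag, separated by spaces. The entries below are illustrative only, not
# taken from the author's dictionary:
#   雲計算 5 n
#   自然語言處理 10 n
#   白寧超 nr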
'''
title: Corpus pre-processing with jieba segmentation: a single-text processor and a batch (multi-file) processor
1 Walk the corpus directory and locate the texts
2 Recreate the directory structure of the original corpus for the output
3 Segment the original texts with jieba and remove stop words
4 Normalise the pre-processed results and save them under the original directory structure
author: 白寧超
myblog: http://www.cnblogs.com/baiboy/
'''
'''
Segmentation, POS tagging and stop-word removal (single text)
stopwordspath: path of the stop-word list
dealpath:      path of the Chinese text to be pre-processed
savepath:      path where the pre-processed result is saved
'''
def cutTxtWord(dealpath, savepath, stopwordspath):
    # Stop-word table
    stopwords = {}.fromkeys([line.rstrip() for line in open(stopwordspath, "r", encoding='utf-8')])
    with open(dealpath, "r", encoding='utf-8') as f:
        txtlist = f.read()            # read the text to be processed
    words = pseg.cut(txtlist)         # segmentation result with POS tags
    cutresult = ""                    # segmentation result after stop-word removal
    for word, flag in words:
        if word not in stopwords:     # drop stop words
            cutresult += word + "/" + flag + " "
    getFlag(cutresult, savepath)      # POS filtering and saving
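# Note: the stop-word files are read line by line above, so each file is assumed
# to be a plain-text list with one stop word per line, for example (illustrative):
#   的
#   了
#   是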
'''
Segmentation, POS tagging and stop-word removal (batch version)
stopwordspath:     path of the stop-word list
read_folder_path:  root path of the Chinese texts to be pre-processed
write_folder_path: root path where the pre-processed results are saved
filescount=300     # maximum number of files per folder (mentioned by the author, not enforced in the code below)
'''
def cutFileWord(read_folder_path, write_folder_path, stopwordspath):
    # Stop-word table
    stopwords = {}.fromkeys([line.rstrip() for line in open(stopwordspath, "r", encoding='utf-8')])
    # All category folders under the root directory to be processed
    folder_list = os.listdir(read_folder_path)
    # Loop over categories
    for folder in folder_list:
        # Path of the current category
        new_folder_path = os.path.join(read_folder_path, folder)
        # Create the directory that will hold the results of this category
        path = write_folder_path + folder      # sub-folder for the saved files
        isExists = os.path.exists(path)
        if not isExists:
            os.makedirs(path)
            print(path + ' created successfully')
        else:
            pass
        save_folder_path = os.path.join(write_folder_path, folder)   # save path of the current category
        print('--> Please wait, processing...')
        # Loop over the files inside the category
        files = os.listdir(new_folder_path)
        j = 1
        for file in files:
            if j > len(files):
                break
            dealpath = os.path.join(new_folder_path, file)   # path of the single file being processed
            with open(dealpath, "r", encoding='utf-8') as f:
                txtlist = f.read()
            # (optional) filter out Chinese/English punctuation and special symbols
            # txtlist1 = re.sub("[\s+\.\!\/_,$%^*(+\"\']+|[+——!,。?、~@#¥%……&*()]+", "",txtlist)
            words = pseg.cut(txtlist)   # segmentation result with POS tags
            cutresult = ""              # single text: result after segmentation and stop-word removal
            for word, flag in words:
                if word not in stopwords:   # drop stop words
                    cutresult += word + "/" + flag + " "
            savepath = os.path.join(save_folder_path, file)
            getFlag(cutresult, savepath)
            j += 1
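# Expected directory layout (inferred from the code above; the concrete paths are the author's):
#   read_folder_path/<category>/<file>.txt     original texts, one sub-folder per category
#   write_folder_path/<category>/<file>.txt    pre-processed output, mirroring the input layout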
'''
POS filtering
cutresult: str, the initial segmentation result
savepath:  path where the file is saved
'''
def getFlag(cutresult, savepath):
    txtlist = []   # result after the unwanted POS tags are filtered out
    # POS tags to be filtered out (defined by the author)
    cixing = ["/x", "/zg", "/uj", "/ul", "/e", "/d", "/uz", "/y"]
    for line in cutresult.split('\n'):
        line_list2 = re.split('[ ]', line)
        line_list2.append("\n")   # keep the original paragraph structure
        line_list = line_list2[:]
        for segs in line_list2:
            for K in cixing:
                if K in segs:
                    line_list.remove(segs)
                    break
                else:
                    pass
        txtlist.extend(line_list)
    # Strip the POS tags
    resultlist = txtlist[:]   # copy of the filtered list (not used below)
    flagresult = ""
    for v in txtlist:
        if "/" in v:
            slope = v.index("/")
            letter = v[0:slope] + " "
            flagresult += letter
        else:
            flagresult += v
    standdata(flagresult, savepath)
'''
Normalisation: remove blank lines, extra whitespace, etc.
flagresult: the filtered result
savepath:   path where the result is saved
'''
def standdata(flagresult, savepath):
    f2 = open(savepath, "w", encoding='utf-8')
    for line in flagresult.split('\n'):
        if len(line) >= 2:
            line_clean = "/ ".join(line.split())
            lines = line_clean + " " + "\n"
            f2.write(lines)
        else:
            pass
    f2.close()
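# A small illustration of the last two steps (not from the original post): given
# cutresult = "今天/t 天氣/n 很/d 好/a ", getFlag drops "很/d" because "/d" is
# listed in cixing, strips the remaining POS tags, and standdata then writes the
# line "今天/ 天氣/ 好 " (tokens joined by "/ ", trailing space kept) to savepath.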
if __name__ == '__main__':
    t1 = time.time()

    # Test on a single file
    dealpath = "../Database/SogouC/FileTest/1.txt"
    savepath = "../Database/SogouCCut/FileTest/1.txt"

    stopwordspath = '../Database/stopwords/CH_stopWords.txt'
    stopwordspath1 = '../Database/stopwords/HG_stopWords.txt'   # HIT (Harbin Institute of Technology) stop-word list

    # Batch-process the files under a folder
    # rfolder_path = '../Database/SogouC/Sample/'
    rfolder_path = '../Database/SogouC/FileNews/'
    # Root path for saving the segmented results
    wfolder_path = '../Database/SogouCCut/'

    # Chinese corpus pre-processors
    # cutTxtWord(dealpath, savepath, stopwordspath)            # single-text pre-processor
    cutFileWord(rfolder_path, wfolder_path, stopwordspath)     # multi-text (batch) pre-processor

    t2 = time.time()
    print("Chinese corpus pre-processing finished, elapsed time: " + str(t2 - t1) + " seconds.")   # report the result
Execution results:

