# -*- coding: utf-8 -*-
"""Segment Chinese text files with jieba, drop stopwords, write one output file.

Walks every file under `path`, cuts each line into words with jieba,
removes stopwords listed in chstop.txt, and appends the space-joined
result to myOutput.txt (first line of the output is the file count).
"""
import os

import jieba


def splitSentence(inputFile):
    """Segment one file line by line and append the results to the global `fout`.

    Relies on module-level globals:
      stop -- set of stopwords to drop
      fout -- open, writable text file receiving the segmented output

    Each input line produces one output line of space-separated tokens.
    """
    # errors='ignore' mirrors the original's lenient decoding of the corpus,
    # which may contain stray bytes.
    with open(inputFile, 'r', encoding='utf-8', errors='ignore') as fin:
        for eachLine in fin:
            line = eachLine.strip()  # drop surrounding whitespace / blank tails
            # jieba.cut returns a generator of tokens; filter stopwords as we go.
            kept = [word for word in jieba.cut(line) if word not in stop]
            # ' '.join avoids the quadratic string concatenation of the original;
            # no .encode() needed — the file is opened in text mode with utf-8.
            fout.write(' '.join(kept))
            fout.write('\n')


path = '/home/xdj/train'
# Collect every file (full path) under the training directory tree.
fns = [os.path.join(root, fn)
       for root, dirs, files in os.walk(path)
       for fn in files]

# Stopwords as a set: O(1) membership test in the per-token hot loop
# (the original used a list, giving O(n) per lookup).
with open('/home/xdj/chstop.txt', encoding='utf-8', errors='ignore') as stop_file:
    stop = {line.strip() for line in stop_file}

fout = open('myOutput.txt', 'w', encoding='utf-8')
fout.write('%d' % len(fns) + '\n')  # header: number of files processed
for f in fns:
    splitSentence(f)
print(len(fns))
fout.close()