The Naive Bayes algorithm is simple and efficient. In this post we look at how to apply it to identifying the author of 《紅樓夢》 (Dream of the Red Chamber).
The first step, of course, is to get hold of the text. I simply downloaded a txt copy from the internet (I was rushing to hand in a first draft at the time...). Since classification has to be done chapter by chapter, once we have the text we first split it into chapters, then strip punctuation, segment the words, and compute word frequencies.
# -*- coding: utf-8 -*-
import re
import jieba
import string
import collections as coll

jieba.load_userdict('E:\\forpython\\紅樓夢詞匯大全.txt')  # load the Sogou Dream of the Red Chamber word list


class textprocesser:
    def __init__(self):
        pass

    # split the novel into 120 chapters and save each one to its own txt file
    def divide_into_chapter(self):
        red = open('E:\\forpython\\紅樓夢.txt', encoding='utf-8')
        each_line = red.readline()
        chapter_count = 0
        chapter_text = ''
        compiled_rule = re.compile('第[一二三四五六七八九十百]+回 ')

        while each_line:
            if re.findall(compiled_rule, each_line):
                # a new chapter heading: write out the text collected so far
                file_name = 'chap' + str(chapter_count)
                file_out = open('E:\\forpython\\chapters\\' + file_name + '.txt', 'a', encoding='utf-8')
                file_out.write(chapter_text)
                chapter_count += 1
                file_out.close()
                chapter_text = each_line
            else:
                chapter_text += each_line

            each_line = red.readline()

        # write out the final chapter, which has no heading after it
        file_out = open('E:\\forpython\\chapters\\chap' + str(chapter_count) + '.txt', 'a', encoding='utf-8')
        file_out.write(chapter_text)
        file_out.close()
        red.close()

    # segment a single chapter into words
    def segmentation(self, text, text_count):
        file_name = 'chap' + str(text_count) + '-words.txt'
        file_out = open('E:\\forpython\\chapter2words\\' + file_name, 'a', encoding='utf-8')
        delset = str.maketrans('', '', string.punctuation)  # translation table that deletes English punctuation

        line = text.readline()

        while line:
            seg_list = jieba.cut(line, cut_all=False)
            words = ' '.join(seg_list)
            words = words.translate(delset)            # remove English punctuation
            words = ''.join(words.split('\n'))         # remove line breaks
            words = self.delCNf(words)                 # remove Chinese punctuation
            words = re.sub('[ \u3000]+', ' ', words)   # collapse repeated spaces
            file_out.write(words)
            line = text.readline()

        file_out.close()
        text.close()

    # segment every chapter
    def do_segmentation(self):
        for loop in range(1, 121):
            file_name = 'chap' + str(loop) + '.txt'
            file_in = open('E:\\forpython\\chapters\\' + file_name, 'r', encoding='utf-8')
            self.segmentation(file_in, loop)

    # keep only Chinese characters, letters, digits and whitespace (drops remaining punctuation)
    def delCNf(self, line):
        regex = re.compile(r'[^\u4e00-\u9fa5a-zA-Z0-9\s]')
        return regex.sub('', line)

    # count word frequencies for one segmented chapter
    def count_words(self, text, textID):
        words = str(text).split()
        words_dict = coll.Counter(words)  # word-frequency dictionary

        file_name = 'chap' + str(textID) + '-wordcount.txt'
        file_out = open('E:\\forpython\\chapter-wordcount\\' + file_name, 'a', encoding='utf-8')

        # sort by frequency and write to file
        sorted_result = sorted(words_dict.items(), key=lambda d: d[1], reverse=True)
        for one in sorted_result:
            file_out.write(one[0] + '\t' + str(one[1]) + '\n')

        file_out.close()

    def do_wordcount(self):
        for loop in range(1, 121):
            file_name = 'chap' + str(loop) + '-words.txt'
            file_in = open('E:\\forpython\\chapter2words\\' + file_name, 'r', encoding='utf-8')
            text = file_in.read()
            file_in.close()
            self.count_words(text, loop)


if __name__ == '__main__':
    processer = textprocesser()
    processer.divide_into_chapter()
    processer.do_segmentation()
    processer.do_wordcount()
For text classification, I feel the most important step is choosing the feature vector. After reading the relevant literature, I decided to use some fifty classical Chinese function words (文言虛詞) plus about twenty words that appear in every one of the 120 chapters; the use of function words is independent of the plot and reflects only the author's writing habits. Below is the code that generates the feature vector.
# -*- coding: utf-8 -*-
import jieba
import re
import string
import collections as coll

jieba.load_userdict('E:\\forpython\\紅樓夢詞匯大全.txt')  # load the Sogou Dream of the Red Chamber word list


class featureVector:
    def __init__(self):
        pass

    # keep only Chinese characters, letters, digits and whitespace (drops remaining punctuation)
    def delCNf(self, line):
        regex = re.compile(r'[^\u4e00-\u9fa5a-zA-Z0-9\s]')
        return regex.sub('', line)

    # segment the whole novel
    def cut_words(self):
        red = open('E:\\forpython\\紅樓夢.txt', 'r', encoding='utf-8')
        file_out = open('E:\\forpython\\紅樓夢-詞.txt', 'a', encoding='utf-8')
        delset = str.maketrans('', '', string.punctuation)  # translation table that deletes English punctuation

        line = red.readline()

        while line:
            seg_list = jieba.cut(line, cut_all=False)
            words = ' '.join(seg_list)
            words = words.translate(delset)            # remove English punctuation
            words = ''.join(words.split('\n'))         # remove line breaks
            words = self.delCNf(words)                 # remove Chinese punctuation
            words = re.sub('[ \u3000]+', ' ', words)   # collapse repeated spaces
            file_out.write(words)
            line = red.readline()

        file_out.close()
        red.close()

    # count word frequencies for the whole novel
    def count_words(self):
        data = open('E:\\forpython\\紅樓夢-詞.txt', 'r', encoding='utf-8')
        line = data.read()
        data.close()
        words = line.split()
        words_dict = coll.Counter(words)  # word-frequency dictionary

        file_out = open('E:\\forpython\\紅樓夢-詞頻.txt', 'a', encoding='utf-8')

        # sort by frequency and write to file
        sorted_result = sorted(words_dict.items(), key=lambda d: d[1], reverse=True)
        for one in sorted_result:
            file_out.write(one[0] + '\t' + str(one[1]) + '\n')

        file_out.close()

    def get_featureVector(self):
        # find the words that appear in every one of the 120 chapters
        cleanwords = []
        for loop in range(1, 121):
            data = open('E:\\forpython\\chapter2words\\chap' + str(loop) + '-words.txt', 'r', encoding='utf-8')
            words_list = list(set(data.read().split()))
            data.close()
            cleanwords.extend(words_list)

        cleanwords_dict = coll.Counter(cleanwords)
        cleanwords_dict = {k: v for k, v in cleanwords_dict.items() if v >= 120}
        cleanwords_f = list(cleanwords_dict.keys())

        # add the classical function words and drop duplicates
        xuci = open('E:\\forpython\\文言虛詞.txt', 'r', encoding='utf-8')
        xuci_list = xuci.read().split()
        xuci.close()
        featureVector = list(set(xuci_list + cleanwords_f))
        if '\ufeff' in featureVector:  # drop the BOM if the function-word file starts with one
            featureVector.remove('\ufeff')

        # write the feature words to a file
        file_out = open('E:\\forpython\\紅樓夢-特征向量.txt', 'a', encoding='utf-8')
        for one in featureVector:
            file_out.write(one + '\n')

        file_out.close()
        return featureVector


if __name__ == '__main__':
    vectorbuilter = featureVector()
    vectorbuilter.cut_words()
    vectorbuilter.count_words()
    vectorbuilter.get_featureVector()
Naive Bayes text classification then represents each chapter by the frequencies of these feature words (being lazy here, I just took screenshots from my defense slides).
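The slides themselves are not reproduced here, so as a stand-in here is the textbook multinomial Naive Bayes decision rule that scikit-learn's MultinomialNB implements (my own summary, not taken from the slides). With V the set of feature words and f_w the frequency of feature word w in the chapter being classified, the predicted class is

\hat{c} = \arg\max_{c \in \{1,2\}} \Big( \log P(c) + \sum_{w \in V} f_w \log P(w \mid c) \Big),
\qquad
P(w \mid c) = \frac{N_{c,w} + \alpha}{N_c + \alpha \lvert V \rvert}

where N_{c,w} is the total count of word w over the training chapters of class c, N_c is the total count of all feature words in class c, and alpha is the Laplace smoothing parameter (alpha = 1 by default in MultinomialNB).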
Once all 120 chapters are vectorized with the feature vector, you get a 120 x 70 array, and the rest is straightforward: pick a training set. Here I took chapters 20 to 29 from the first 80 chapters, labeled them class 1, and used them as the training set for the first class; from the last 40 chapters I took chapters 110 to 119, labeled them class 2, and used them as the training set for the second class.
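The classification script below imports a helper module, get_trainset, that is not shown in this post. As a rough idea of what it has to provide, here is a minimal sketch under my own assumptions: it reads the feature words from 'E:\\forpython\\紅樓夢-特征向量.txt' and the segmented chapters from the chapter2words folder produced earlier, and the names get_train_set and get_all_vector simply mirror the import in the next script. The author's actual module may look quite different.

# -*- coding: utf-8 -*-
# Hypothetical sketch of a get_trainset.py module; not the author's original code.
import numpy as np
import collections as coll


class get_train_set:
    def __init__(self):
        # feature words written out by featureVector.get_featureVector()
        with open('E:\\forpython\\紅樓夢-特征向量.txt', 'r', encoding='utf-8') as f:
            self.feature_words = f.read().split()

    def get_all_vector(self):
        # 120 x len(feature_words) matrix: row i holds the feature-word
        # frequencies of chapter i + 1
        vectors = np.zeros((120, len(self.feature_words)), dtype=int)
        for loop in range(1, 121):
            path = 'E:\\forpython\\chapter2words\\chap' + str(loop) + '-words.txt'
            with open(path, 'r', encoding='utf-8') as f:
                counts = coll.Counter(f.read().split())
            vectors[loop - 1] = [counts[w] for w in self.feature_words]
        return vectors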
# -*- coding: utf-8 -*-
import numpy as np
from sklearn.naive_bayes import MultinomialNB
import get_trainset as ts

# the full 120-chapter frequency matrix built from the feature vector
x_train = ts.get_train_set().get_all_vector()


class result:
    def __init__(self):
        pass

    def have_Xtrainset(self):
        # training set: chapters 20-29 and chapters 110-119
        Xtrainset = x_train
        Xtrainset = np.vstack((Xtrainset[19:29], Xtrainset[109:119]))
        return Xtrainset

    def as_num(self, x):
        # print probabilities as plain decimals rather than scientific notation
        return '{:.10f}'.format(x)

    def built_model(self):
        x_trainset = self.have_Xtrainset()
        y_classset = np.repeat(np.array([1, 2]), [10, 10])  # ten labels of 1 followed by ten labels of 2

        NBclf = MultinomialNB()
        NBclf.fit(x_trainset, y_classset)  # fit the model

        all_vector = x_train

        result = NBclf.predict(all_vector)
        print('Predicted classes for the first ' + str(len(result[0:80])) + ' chapters:')
        print(result[0:80])
        print('Predicted classes for the last ' + str(len(result[80:120])) + ' chapters:')
        print(result[80:120])

        # chapters of the last 40 that were assigned to class 1 (0-based indices)
        diff_chapter = [80, 81, 83, 84, 87, 88, 90, 100]
        for i in diff_chapter:
            # predict_proba expects a 2-D array, so reshape the single chapter vector
            tempr = NBclf.predict_proba(np.asarray(all_vector[i]).reshape(1, -1))
            print('Class probabilities for chapter ' + str(i + 1) + ':')
            print(self.as_num(tempr[0][0]) + ' ' + self.as_num(tempr[0][1]))


if __name__ == '__main__':
    res = result()
    res.built_model()
The code above simply calls scikit-learn's MultinomialNB; I described how it works in detail in the previous post.
The classification results are as follows:
Judging from the final classification results, there is a fairly clear dividing point around chapter 82. The first 80 chapters and the last 40 chapters therefore do differ noticeably in writing style, which is broadly consistent with the long-standing view in Redology (紅樓夢 scholarship).
As for why 8 of the last 40 chapters were assigned to class 1, these are chapters 81, 82, 84, 85, 88, 89, 91, and 101, all of which lie near chapter 80, so the discrepancy may come from the need to keep continuity with the preceding text. Since the copy of 《紅樓夢》 used here was downloaded from the internet and its edition is unknown, the edition itself may also be a factor.
The code certainly still has plenty of room for improvement; please forgive this humble attempt....