Each text file contains the data for the corresponding class (label 0: joy, 1: anger, 2: disgust, 3: depression, i.e. the different emotion categories).
- Read the text files and attach a label to every line.
- Extract features from the text: count word frequencies and keep the most common words.
- Match each document against this high-frequency vocabulary and compute a TF-IDF value for every matched word.
- Classify with Gaussian Naive Bayes: for each class, the mean and variance of every feature are used to compute the Gaussian likelihood of a sample (see the sketch after this list).
- Evaluate the model's accuracy on the test set.
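
A minimal sketch of the Gaussian Naive Bayes idea (the feature values and labels below are made-up toy numbers, not the project's data): for every class the classifier estimates the mean and variance of each feature, and a new sample is scored by the Gaussian likelihood of its feature values under each class.

```python
import numpy as np
from sklearn.naive_bayes import GaussianNB

# Hypothetical 2-dimensional TF-IDF-like feature vectors for two sentiment classes
X_toy = np.array([[0.9, 0.1], [0.8, 0.0], [0.1, 0.7], [0.0, 0.9]])
y_toy = np.array([0, 0, 1, 1])

gnb = GaussianNB()
gnb.fit(X_toy, y_toy)

print(gnb.theta_)                       # per-class feature means learned from the toy data
print(gnb.predict([[0.85, 0.05]]))      # -> [0]; the sample is closest to the class-0 Gaussians
print(gnb.predict_proba([[0.85, 0.05]]))
```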
main.py
```python
# -*- coding: utf-8 -*-
import os
import pandas as pd
import nltk
from tools import proc_text, split_train_test, get_word_list_from_data, \
    extract_feat_from_data, cal_acc
from nltk.text import TextCollection
from sklearn.naive_bayes import GaussianNB

dataset_path = './dataset'
text_filenames = ['0_simplifyweibo.txt', '1_simplifyweibo.txt',
                  '2_simplifyweibo.txt', '3_simplifyweibo.txt']

# CSV file for the raw data
output_text_filename = 'raw_weibo_text.csv'

# CSV file for the cleaned text data
output_cln_text_filename = 'clean_weibo_text.csv'

# Processing and cleaning the text takes a while, so it is controlled by is_first_run.
# On the first run the raw text has to be processed and cleaned, so set it to True.
# If the cleaned text data has already been produced and saved, set it to False.
is_first_run = True


def read_and_save_to_csv():
    """
    Read the raw text files and save the labels and texts as a CSV file.
    """
    text_w_label_df_lst = []
    for text_filename in text_filenames:
        text_file = os.path.join(dataset_path, text_filename)

        # The label is the leading digit of the filename, i.e. 0, 1, 2 or 3
        label = int(text_filename[0])

        # Read the text file
        with open(text_file, 'r', encoding='utf-8') as f:
            lines = f.read().splitlines()

        labels = [label] * len(lines)

        text_series = pd.Series(lines)
        label_series = pd.Series(labels)

        # Build the DataFrame for this file
        text_w_label_df = pd.concat([label_series, text_series], axis=1)
        text_w_label_df_lst.append(text_w_label_df)

    result_df = pd.concat(text_w_label_df_lst, axis=0)

    # Save as a CSV file
    result_df.columns = ['label', 'text']
    result_df.to_csv(os.path.join(dataset_path, output_text_filename),
                     index=None, encoding='utf-8')


def run_main():
    """
    Main function.
    """
    # 1. Read, process, clean and prepare the data
    if is_first_run:
        print('Processing and cleaning the text data...', end=' ')
        # On the first run the raw text data has to be processed and cleaned

        # Read the raw text files and save the labels and texts as CSV
        read_and_save_to_csv()

        # Read the generated CSV file and build the dataset
        text_df = pd.read_csv(os.path.join(dataset_path, output_text_filename),
                              encoding='utf-8')

        # Process the text data
        text_df['text'] = text_df['text'].apply(proc_text)

        # Filter out empty strings
        text_df = text_df[text_df['text'] != '']

        # Save the cleaned text data
        text_df.to_csv(os.path.join(dataset_path, output_cln_text_filename),
                       index=None, encoding='utf-8')
        print('Done, results saved.')

    # 2. Split into training and test sets
    print('Loading the cleaned text data')
    clean_text_df = pd.read_csv(os.path.join(dataset_path, output_cln_text_filename),
                                encoding='utf-8')
    # Split into training and test sets
    train_text_df, test_text_df = split_train_test(clean_text_df)
    # Inspect the class distribution of both sets
    print('Samples per class in the training set:', train_text_df.groupby('label').size())
    print('Samples per class in the test set:', test_text_df.groupby('label').size())

    # 3. Feature extraction
    # Count word frequencies
    n_common_words = 200

    # Collect the words of the training set and count their frequencies
    print('Counting word frequencies...')
    all_words_in_train = get_word_list_from_data(train_text_df)
    fdist = nltk.FreqDist(all_words_in_train)
    common_words_freqs = fdist.most_common(n_common_words)
    print('The {} most frequent words are:'.format(n_common_words))
    for word, count in common_words_freqs:
        print('{}: {} occurrences'.format(word, count))
    print()

    # Extract features on the training set
    text_collection = TextCollection(train_text_df['text'].values.tolist())
    print('Extracting features from the training samples...', end=' ')
    train_X, train_y = extract_feat_from_data(train_text_df, text_collection, common_words_freqs)
    print('Done')
    print()

    print('Extracting features from the test samples...', end=' ')
    test_X, test_y = extract_feat_from_data(test_text_df, text_collection, common_words_freqs)
    print('Done')

    # 4. Train the Naive Bayes model
    print('Training the model...', end=' ')
    gnb = GaussianNB()
    gnb.fit(train_X, train_y)
    print('Done')
    print()

    # 5. Prediction
    print('Testing the model...', end=' ')
    test_pred = gnb.predict(test_X)
    print('Done')

    # Print the accuracy
    print('Accuracy:', cal_acc(test_y, test_pred))


if __name__ == '__main__':
    run_main()
```
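
The TF-IDF values used in step 3 come from nltk's `TextCollection`. Below is a quick, self-contained check of what `tf_idf` returns; the mini-corpus is hypothetical. Because the texts are plain space-joined strings (as produced by `proc_text`), nltk counts substring occurrences rather than whole tokens, which matches how `extract_feat_from_data` in tools.py calls it.

```python
from nltk.text import TextCollection

# Hypothetical mini-corpus of already-segmented texts (space-joined words, as proc_text outputs)
docs = ['開心 今天 開心', '難過 今天', '生氣 討厭']
tc = TextCollection(docs)

print(tc.tf('開心', docs[0]))      # term frequency: occurrences in the text / length of the text
print(tc.idf('開心'))              # inverse document frequency: log(total texts / texts containing the term)
print(tc.tf_idf('開心', docs[0]))  # product of the two values
```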
tools.py
```python
# -*- coding: utf-8 -*-
import re
import math

import jieba.posseg as pseg
import numpy as np
import pandas as pd

# Load the common stop words
stopwords1 = [line.rstrip() for line in open('./中文停用詞庫.txt', 'r', encoding='utf-8')]
# stopwords2 = [line.rstrip() for line in open('./哈工大停用詞表.txt', 'r', encoding='utf-8')]
# stopwords3 = [line.rstrip() for line in open('./四川大學機器智能實驗室停用詞庫.txt', 'r', encoding='utf-8')]
# stopwords = stopwords1 + stopwords2 + stopwords3
stopwords = stopwords1


def proc_text(raw_line):
    """
    Process one line of text and return its segmentation result.
    """
    # 1. Use a regular expression to remove all non-Chinese characters
    filter_pattern = re.compile('[^\u4E00-\u9FD5]+')
    chinese_only = filter_pattern.sub('', raw_line)

    # 2. Segment with jieba and tag the parts of speech
    words_lst = pseg.cut(chinese_only)

    # 3. Remove stop words
    meaningful_words = []
    for word, flag in words_lst:
        # if (word not in stopwords) and (flag == 'v'):
        # The part-of-speech tag could also be used, e.g. to drop everything but verbs
        if word not in stopwords:
            meaningful_words.append(word)

    return ' '.join(meaningful_words)


def split_train_test(text_df, size=0.8):
    """
    Split the data into a training set and a test set.
    """
    # To keep the class proportions identical in the training and test sets,
    # each class is processed in turn.
    train_text_df = pd.DataFrame()
    test_text_df = pd.DataFrame()

    labels = [0, 1, 2, 3]
    for label in labels:
        # Select the records with this label
        text_df_w_label = text_df[text_df['label'] == label]
        # Reset the index so each class is indexed from 0, which simplifies the split below
        text_df_w_label = text_df_w_label.reset_index()

        # By default split into 80% training data and 20% test data.
        # To keep things simple, the first 80% goes to the training set and the
        # remaining 20% to the test set.
        # A random 80/20 split of the DataFrame would work as well (try implementing it).

        # Number of rows for this class
        n_lines = text_df_w_label.shape[0]
        split_line_no = math.floor(n_lines * size)
        text_df_w_label_train = text_df_w_label.iloc[:split_line_no, :]
        text_df_w_label_test = text_df_w_label.iloc[split_line_no:, :]

        # Add to the overall training and test sets
        # (DataFrame.append was removed in pandas 2.0, so pd.concat is used instead)
        train_text_df = pd.concat([train_text_df, text_df_w_label_train])
        test_text_df = pd.concat([test_text_df, text_df_w_label_test])

    train_text_df = train_text_df.reset_index()
    test_text_df = test_text_df.reset_index()
    return train_text_df, test_text_df


def get_word_list_from_data(text_df):
    """
    Collect all the words of the dataset into a single list.
    """
    word_list = []
    for _, r_data in text_df.iterrows():
        word_list += r_data['text'].split(' ')
    return word_list


def extract_feat_from_data(text_df, text_collection, common_words_freqs):
    """
    Feature extraction.
    """
    # Only TF-IDF features are used here as an example.
    # Word counts or other text features could be added as extra features.

    n_sample = text_df.shape[0]
    n_feat = len(common_words_freqs)
    common_words = [word for word, _ in common_words_freqs]

    # Initialisation
    X = np.zeros([n_sample, n_feat])
    y = np.zeros(n_sample)

    print('Extracting features...')
    for i, r_data in text_df.iterrows():
        if (i + 1) % 5000 == 0:
            print('Features extracted for {} samples'.format(i + 1))

        text = r_data['text']

        feat_vec = []
        for word in common_words:
            if word in text:
                # The word is one of the frequent words, so compute its TF-IDF value
                tf_idf_val = text_collection.tf_idf(word, text)
            else:
                tf_idf_val = 0

            feat_vec.append(tf_idf_val)

        # Assign to the feature matrix and label vector
        X[i, :] = np.array(feat_vec)
        y[i] = int(r_data['label'])

    return X, y


def cal_acc(true_labels, pred_labels):
    """
    Compute the accuracy.
    """
    n_total = len(true_labels)
    correct_list = [true_labels[i] == pred_labels[i] for i in range(n_total)]

    acc = sum(correct_list) / n_total
    return acc
```
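
A minimal usage sketch for `proc_text` (the raw Weibo line below is made up). It assumes the stop-word file ./中文停用詞庫.txt is present in the working directory, since tools.py loads it at import time; the exact output depends on jieba's dictionary and on the stop-word list.

```python
from tools import proc_text

raw = '今天天氣真好,超開心!!! http://t.cn/xyz'   # hypothetical raw line
print(proc_text(raw))
# Non-Chinese characters are stripped first, jieba segments the rest and stop words are removed,
# giving something like '今天 天氣 真 好 超 開心'
```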
