Weibo Sentiment Analysis


Each text file contains the data for one class (0: joy, 1: anger, 2: disgust, 3: depressed, corresponding to the different sentiment categories).

  1. Read the text files.
  2. Using each feature's mean and variance, compute the probability of a word (feature value) under a Gaussian distribution, which is what Gaussian Naive Bayes does (see the sketch right after this list).
  3. Extract features from the text by counting word frequencies.
  4. Match each text's words against the high-frequency word list to build its feature vector.
  5. Evaluate the accuracy of the model.
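
Step 2 refers to the per-feature likelihood that Gaussian Naive Bayes (used via sklearn's GaussianNB in main.py) combines across features and classes. A minimal sketch of that likelihood, with made-up numbers purely for illustration:

import numpy as np

def gaussian_likelihood(x, mean, var):
    """Density of feature value x under a normal distribution N(mean, var)."""
    return np.exp(-(x - mean) ** 2 / (2 * var)) / np.sqrt(2 * np.pi * var)

# Example: a TF-IDF value of 0.3 scored against a class whose feature
# mean is 0.25 and variance is 0.01 (illustrative numbers only)
print(gaussian_likelihood(0.3, 0.25, 0.01))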

main.py

# -*- coding: utf-8 -*-
import os
import pandas as pd
import nltk
from tools import proc_text, split_train_test, get_word_list_from_data, \
    extract_feat_from_data, cal_acc
from nltk.text import TextCollection
from sklearn.naive_bayes import GaussianNB

dataset_path = './dataset'
text_filenames = ['0_simplifyweibo.txt', '1_simplifyweibo.txt',
                  '2_simplifyweibo.txt', '3_simplifyweibo.txt']

# CSV file with the raw data
output_text_filename = 'raw_weibo_text.csv'

# CSV file with the cleaned text data
output_cln_text_filename = 'clean_weibo_text.csv'

# Processing and cleaning the text takes a while, so it is controlled by is_first_run.
# Set it to True on the first run, when the raw text still needs to be processed and cleaned.
# Set it to False once the cleaned text data has already been saved.
is_first_run = True


def read_and_save_to_csv():
    """
        Read the raw text files and save labels and text to a CSV file.
    """

    text_w_label_df_lst = []
    for text_filename in text_filenames:
        text_file = os.path.join(dataset_path, text_filename)

        # The label is the first character of the file name, i.e. 0, 1, 2, 3
        label = int(text_filename[0])

        # Read the text file
        with open(text_file, 'r', encoding='utf-8') as f:
            lines = f.read().splitlines()

        labels = [label] * len(lines)

        text_series = pd.Series(lines)
        label_series = pd.Series(labels)

        # Build the DataFrame
        text_w_label_df = pd.concat([label_series, text_series], axis=1)
        text_w_label_df_lst.append(text_w_label_df)

    result_df = pd.concat(text_w_label_df_lst, axis=0)

    # Save to CSV
    result_df.columns = ['label', 'text']
    result_df.to_csv(os.path.join(dataset_path, output_text_filename),
                     index=False, encoding='utf-8')


def run_main():
    """
        Main function
    """
    # 1. Read, process, clean and prepare the data
    if is_first_run:
        print('Processing and cleaning text data...', end=' ')
        # On the first run the raw text data has to be processed and cleaned

        # Read the raw text files and save labels and text to CSV
        read_and_save_to_csv()

        # Read the resulting CSV file and build the dataset
        text_df = pd.read_csv(os.path.join(dataset_path, output_text_filename),
                              encoding='utf-8')

        # Clean the text data
        text_df['text'] = text_df['text'].apply(proc_text)

        # Filter out empty strings
        text_df = text_df[text_df['text'] != '']

        # Save the cleaned text data
        text_df.to_csv(os.path.join(dataset_path, output_cln_text_filename),
                       index=False, encoding='utf-8')
        print('Done, results saved.')

    # 2. Split into training and test sets
    print('Loading cleaned text data')
    clean_text_df = pd.read_csv(os.path.join(dataset_path, output_cln_text_filename),
                                encoding='utf-8')
    # Split into training and test sets
    train_text_df, test_text_df = split_train_test(clean_text_df)
    # Inspect basic information about the two sets
    print('Samples per class in the training set:', train_text_df.groupby('label').size())
    print('Samples per class in the test set:', test_text_df.groupby('label').size())

    # 3. Feature extraction
    # Count word frequencies
    n_common_words = 200

    # Collect the words in the training set and count their frequencies
    print('Counting word frequencies...')
    all_words_in_train = get_word_list_from_data(train_text_df)
    fdist = nltk.FreqDist(all_words_in_train)
    common_words_freqs = fdist.most_common(n_common_words)
    print('The {} most frequent words are:'.format(n_common_words))
    for word, count in common_words_freqs:
        print('{}: {} occurrences'.format(word, count))
    print()

    # Extract features on the training set
    text_collection = TextCollection(train_text_df['text'].values.tolist())
    print('Extracting features for the training samples...', end=' ')
    train_X, train_y = extract_feat_from_data(train_text_df, text_collection, common_words_freqs)
    print('Done')
    print()

    print('Extracting features for the test samples...', end=' ')
    test_X, test_y = extract_feat_from_data(test_text_df, text_collection, common_words_freqs)
    print('Done')

    # 4. Train a Naive Bayes model
    print('Training model...', end=' ')
    gnb = GaussianNB()
    gnb.fit(train_X, train_y)
    print('Done')
    print()

    # 5. Prediction
    print('Testing model...', end=' ')
    test_pred = gnb.predict(test_X)
    print('Done')

    # Print the accuracy
    print('Accuracy:', cal_acc(test_y, test_pred))


if __name__ == '__main__':
    run_main()
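
For context, the tf_idf value requested from TextCollection above is term frequency times inverse document frequency. A minimal, self-contained sketch of that standard computation over tokenized documents (NLTK's TextCollection.tf_idf follows the same idea, although here it operates on the space-joined strings produced by proc_text, so its normalization details may differ slightly):

import math

def tf_idf(term, doc_tokens, all_docs_tokens):
    # Term frequency: share of tokens in this document equal to the term
    tf = doc_tokens.count(term) / len(doc_tokens)
    # Inverse document frequency: log(total documents / documents containing the term)
    n_containing = sum(1 for doc in all_docs_tokens if term in doc)
    idf = math.log(len(all_docs_tokens) / n_containing) if n_containing else 0.0
    return tf * idf

docs = [['天氣', '很', '好'], ['心情', '不', '好'], ['心情', '很', '差']]
print(tf_idf('心情', docs[1], docs))  # tf = 1/3, idf = log(3/2)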

tools.py

# -*- coding: utf-8 -*-
import re
import jieba.posseg as pseg
import pandas as pd
import math
import numpy as np

# Load common stop words
stopwords1 = [line.rstrip() for line in open('./中文停用詞庫.txt', 'r', encoding='utf-8')]
# stopwords2 = [line.rstrip() for line in open('./哈工大停用詞表.txt', 'r', encoding='utf-8')]
# stopwords3 = [line.rstrip() for line in open('./四川大學機器智能實驗室停用詞庫.txt', 'r', encoding='utf-8')]
# stopwords = stopwords1 + stopwords2 + stopwords3
stopwords = stopwords1


def proc_text(raw_line):
    """
        Process one line of text.
        Returns the segmented words joined by spaces.
    """
    # 1. Use a regular expression to remove non-Chinese characters
    filter_pattern = re.compile('[^\u4E00-\u9FD5]+')
    chinese_only = filter_pattern.sub('', raw_line)

    # 2. Word segmentation + part-of-speech tagging with jieba
    words_lst = pseg.cut(chinese_only)

    # 3. Remove stop words
    meaningful_words = []
    for word, flag in words_lst:
        # if (word not in stopwords) and (flag == 'v'):
            # Words can also be filtered by part of speech, e.g. keeping only verbs
        if word not in stopwords:
            meaningful_words.append(word)

    return ' '.join(meaningful_words)


def split_train_test(text_df, size=0.8):
    """
        Split the data into a training set and a test set.
    """
    # To keep the class proportions the same in the training and test sets,
    # each class is processed separately.
    train_text_df = pd.DataFrame()
    test_text_df = pd.DataFrame()

    labels = [0, 1, 2, 3]
    for label in labels:
        # Select the records with this label
        text_df_w_label = text_df[text_df['label'] == label]
        # Reset the index so each class is indexed from 0, which simplifies the split below
        text_df_w_label = text_df_w_label.reset_index(drop=True)

        # Split 80% training / 20% test by default.
        # To keep things simple, the first 80% of the rows go to the training set
        # and the remaining 20% to the test set.
        # A random 80/20 split is also possible (see the sketch after this file).

        # Number of rows for this class
        n_lines = text_df_w_label.shape[0]
        split_line_no = math.floor(n_lines * size)
        text_df_w_label_train = text_df_w_label.iloc[:split_line_no, :]
        text_df_w_label_test = text_df_w_label.iloc[split_line_no:, :]

        # Add to the overall training and test sets
        # (pd.concat is used because DataFrame.append was removed in recent pandas versions)
        train_text_df = pd.concat([train_text_df, text_df_w_label_train])
        test_text_df = pd.concat([test_text_df, text_df_w_label_test])

    train_text_df = train_text_df.reset_index(drop=True)
    test_text_df = test_text_df.reset_index(drop=True)
    return train_text_df, test_text_df


def get_word_list_from_data(text_df):
    """
        Put all the words in the dataset into a single list.
    """
    word_list = []
    for _, r_data in text_df.iterrows():
        word_list += r_data['text'].split(' ')
    return word_list


def extract_feat_from_data(text_df, text_collection, common_words_freqs):
    """
        Feature extraction.
    """
    # Only TF-IDF features are used here as an example.
    # Word counts or other text features could be added as extra features.

    n_sample = text_df.shape[0]
    n_feat = len(common_words_freqs)
    common_words = [word for word, _ in common_words_freqs]

    # Initialize
    X = np.zeros([n_sample, n_feat])
    y = np.zeros(n_sample)

    print('Extracting features...')
    for i, r_data in text_df.iterrows():
        if (i + 1) % 5000 == 0:
            print('Features extracted for {} samples'.format(i + 1))

        text = r_data['text']

        feat_vec = []
        for word in common_words:
            if word in text:
                # If the word is among the high-frequency words, compute its TF-IDF value
                tf_idf_val = text_collection.tf_idf(word, text)
            else:
                tf_idf_val = 0

            feat_vec.append(tf_idf_val)

        # Fill in this sample
        X[i, :] = np.array(feat_vec)
        y[i] = int(r_data['label'])

    return X, y


def cal_acc(true_labels, pred_labels):
    """
        Compute the accuracy.
    """
    n_total = len(true_labels)
    correct_list = [true_labels[i] == pred_labels[i] for i in range(n_total)]

    acc = sum(correct_list) / n_total
    return acc
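
As the comment in split_train_test suggests, the deterministic front/back split could also be replaced with a random one. A minimal sketch using scikit-learn's train_test_split with stratification (split_train_test_random is a hypothetical helper name, not used elsewhere in this project):

from sklearn.model_selection import train_test_split

def split_train_test_random(text_df, size=0.8, seed=42):
    # stratify keeps the proportion of each label equal in both sets
    train_df, test_df = train_test_split(
        text_df, train_size=size, random_state=seed, stratify=text_df['label'])
    return train_df.reset_index(drop=True), test_df.reset_index(drop=True)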

 

