I. Environment Setup
1. Install the third-party nltk package
pip install nltk==3.4.5
2. Install nltk_data
nltk_data holds a large amount of corpus data, including many datasets. This article uses two of them, positive_tweets and negative_tweets, to train the model.
There are two installation methods, online and offline. The offline method is recommended, because the data is large and online downloads often fail.
[a] Online download
Enter the following in the Python interactive shell:
import nltk
nltk.download()
Running this opens a download window. If you do not need the full download, pick the items you want under the corresponding category and click download; a successful download is marked as installed. You can also download a specific package, again from the Python interactive shell:
import nltk
nltk.download('punkt')
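If you do go the online route, a minimal sketch of fetching only the resources used in this article might look like the following. The resource names below are standard NLTK download identifiers listed here as a convenience; they are not taken from the original text.
import nltk

# Download only the corpora and models this tutorial relies on
for resource in ['twitter_samples',             # the positive/negative tweet datasets
                 'punkt',                       # tokenizer models used by word_tokenize
                 'averaged_perceptron_tagger',  # POS tagger used by pos_tag
                 'wordnet',                     # lexical database used by WordNetLemmatizer
                 'stopwords']:                  # English stop word list
    nltk.download(resource)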
[b] Offline download (recommended)
GitHub download URL: https://github.com/nltk/nltk_data
What we mainly need is the content of the packages folder; that folder is the whole of nltk_data.
After downloading, some simple configuration is needed:
1. Rename the downloaded packages folder to nltk_data.
2. Place the renamed nltk_data folder somewhere nltk can find it. To check where that is:
>>> from nltk import data
>>> data.find('.')
FileSystemPathPointer('C:\\Users\\<username>\\AppData\\Roaming\\nltk_data')  # the local load path is printed; placing nltk_data directly under Roaming is enough
Or, if the output lists several search directories instead, placing the nltk_data folder in any of those directories also works.
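To see every directory nltk will search, here is a small sketch using the data path list (nltk.data.path is part of nltk's public API):
from nltk import data

# nltk searches these directories in order when loading corpora;
# dropping the nltk_data folder into any one of them is sufficient
for p in data.path:
    print(p)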
With that, the environment is ready.
II. Steps
- Tokenization
- Data preprocessing
- Building the model data
- Training the model
- Using the model for prediction and analysis
III. Loading the Dataset
Imports:
import re
import string
import random
from nltk.corpus import twitter_samples
from nltk.tag import pos_tag, pos_tag_sents
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import FreqDist
Load the dataset:
We use negative_tweets.json (5,000 tweets with negative sentiment) and positive_tweets.json (5,000 tweets with positive sentiment) from twitter_samples to train the model. You can extract the archive and take a look at the contents of the JSON files.
Get the tweet text with the following code:
positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')  # print these to inspect the tweets being analysed
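A quick sanity check of the loaded data; a small sketch, with the expected counts taken from the dataset description above:
# Each call above returns a list of raw tweet strings
print(len(positive_tweets))   # 5000
print(len(negative_tweets))   # 5000
print(positive_tweets[0])     # the first positive tweet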
IV. Tokenization
# Tokenization: fenci() wraps twitter_samples.tokenized(), see the full code at the end
po_file_path = 'positive_tweets.json'
ne_file_path = 'negative_tweets.json'
po_fenci_res = fenci(po_file_path)[:2]
be_fenci_res = fenci(ne_file_path)[:2]  # the dataset is large, so only the first 2 tweets are kept here
print('Positive tokenization result: {}'.format(po_fenci_res))
print('Negative tokenization result: {}'.format(be_fenci_res))

# Positive tokenization result: [['#FollowFriday', '@France_Inte', '@PKuchly57', '@Milipol_Paris', 'for', 'being', 'top', 'engaged', 'members', 'in', 'my', 'community', 'this', 'week', ':)'], ['@Lamb2ja', 'Hey', 'James', '!', 'How', 'odd', ':/', 'Please', 'call', 'our', 'Contact', 'Centre', 'on', '02392441234', 'and', 'we', 'will', 'be', 'able', 'to', 'assist', 'you', ':)', 'Many', 'thanks', '!']]
# Negative tokenization result: [['hopeless', 'for', 'tmr', ':('], ['Everything', 'in', 'the', 'kids', 'section', 'of', 'IKEA', 'is', 'so', 'cute', '.', 'Shame', "I'm", 'nearly', '19', 'in', '2', 'months', ':(']]
五、數據規范化
數據規范化包括以下步驟
- 詞性標注
- 垃圾數據處理
- 詞性還原
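As a quick, hedged illustration of why the POS tag matters for lemmatization (the exact tags printed may differ by NLTK version; the words are taken from the sample tweets above):
from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
print(pos_tag(['members', 'engaged']))        # e.g. [('members', 'NNS'), ('engaged', 'VBD')]
print(lemmatizer.lemmatize('members', 'n'))   # 'member'  -- NN* tags are mapped to 'n'
print(lemmatizer.lemmatize('engaged', 'v'))   # 'engage'  -- VB* tags are mapped to 'v'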
def cleaned_list_func(evert_tweet):
    """
    Data preprocessing
    :param evert_tweet: one tweet / one English sentence to analyse
    :return: the processed words, as a flat list
    """
    new_text = []
    cixing_list = pos_tag(evert_tweet)   # [('', 'NN'), ('', 'NNS'), ()]
    print('POS tagging result for this tweet: {}'.format(cixing_list))
    for word, cixing in cixing_list:
        word = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|(?:[0-9a-fA-F][0-9a-fA-F]))+', '', word)  # regex that strips URLs
        word = re.sub('(@[A-Za-z0-9_]+)', '', word)  # strips usernames, i.e. the parts starting with @
        if cixing.startswith('NN'):   # map the POS tag to the standard WordNet POS codes
            pos = 'n'
        elif cixing.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'
        lemmatizer = WordNetLemmatizer()  # lemmatize with the lemmatize method of WordNetLemmatizer
        new_word = lemmatizer.lemmatize(word, pos)
        if len(new_word) > 0 and new_word not in string.punctuation and \
                new_word.lower() not in stopwords.words('english'):
            new_text.append(new_word.lower())
    return new_text
# 數據規范化
positive_cleaned_list = []
negative_cleaned_list = []
for i in po_fenci_res:
positive_cleaned = cleaned_list_func(i)
positive_cleaned_list.append(positive_cleaned)
print('處理后的積極推文結果: {}'.format(positive_cleaned_list))
print('原積極數據對比: {}'.format(positive_tweets[:2]))
The POS-tagged result is:
# POS tagging result for this tweet: [('#FollowFriday', 'JJ'), ('@France_Inte', 'NNP'), ('@PKuchly57', 'NNP'), ('@Milipol_Paris', 'NNP'), ('for', 'IN'), ('being', 'VBG'), ('top', 'JJ'), ('engaged', 'VBN'), ('members', 'NNS'), ('in', 'IN'), ('my', 'PRP$'), ('community', 'NN'), ('this', 'DT'), ('week', 'NN'), (':)', 'NN')]
A comparison of the processed tweets with the original data:
# Processed positive tweets: [['#followfriday', 'top', 'engage', 'member', 'community', 'week', ':)'], ['hey', 'james', 'odd', ':/', 'please', 'call', 'contact', 'centre', '02392441234', 'able', 'assist', ':)', 'many', 'thanks']]
# Original positive data for comparison: ['#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)', '@Lamb2ja Hey James! How odd :/ Please call our Contact Centre on 02392441234 and we will be able to assist you :) Many thanks!']
VI. Building the Model Data
def get_tweets_for_model(clean_tokens_list, tag):
    """
    Prepare the model data
    :param clean_tokens_list: the processed tweets, a list of token lists
    :param tag: the class label
    :return: a flat list whose elements are 2-tuples
    """
    li = []
    for every_tweet in clean_tokens_list:
        data_dict = dict([token, True] for token in every_tweet)   # {'token': True, 'token': True}
        li.append((data_dict, tag))
    return li
# Prepare the model data
po_for_model = get_tweets_for_model(positive_cleaned_list, 'Positive')
ne_for_model = get_tweets_for_model(negative_cleaned_list, 'Negative')
print('Positive data prepared for the model: {}'.format(po_for_model))
print('Negative data prepared for the model: {}'.format(ne_for_model))
The data structure at this point is:
# Negative data prepared for the model: [({'hopeless': True, 'tmr': True, ':(': True}, 'Negative'), ({'everything': True, 'kid': True, 'section': True, 'ikea': True, 'cute': True, 'shame': True, "i'm": True, 'nearly': True, '19': True, '2': True, 'month': True, ':(': True}, 'Negative')]
VII. Preparing the Training and Test Sets
model_data = po_for_model + ne_for_model
random.shuffle(model_data)       # shuffle the data
train_data = model_data[:7000]   # the first 7000 samples form the training set
test_data = model_data[7000:]    # the rest form the test set, used to measure the accuracy of the trained model
VIII. Training and Testing the Model
def train_model(train_data, test_data):
    """
    Train and test the model
    :param train_data: training set
    :param test_data: test set
    :return: the trained model
    """
    from nltk import classify
    from nltk import NaiveBayesClassifier
    model = NaiveBayesClassifier.train(train_data)
    print('Model accuracy: {}'.format(classify.accuracy(model, test_data)))
    print(model.show_most_informative_features(10))
    return model
# Train and test the model
model = train_model(train_data, test_data)
IX. Using the Model to Predict
def test(model, test_text):
    """
    Predict with the trained model
    :param model: the trained model
    :param test_text: the sentence to analyse
    :return:
    """
    from nltk.tokenize import word_tokenize
    custom_tokens = cleaned_list_func(word_tokenize(test_text))
    result = dict([token, True] for token in custom_tokens)
    yuce_res = model.classify(result)
    print('Text: {} Prediction: {}'.format(test_text, yuce_res))
test_list = [
"I was sad on the day you went away,I'm not the man your heart is missing,that's why you go away I know.",
"My heart is being cut by the knife that is called MISSING YOU. NOthing in the world can destroy me except losing you. My memory of you devours every cell of my blood",
"I will always be there for you.",
'I fuck you fuck your mother fuck your father fuck your family',
"Don't worry when you are not recognized, but strive to be worthy of recognition.",
"The power of imagination makes us infinite.",
"The glow of one warm thought is to me worth more than money."
]
for i in test_list:
    test(model, i)
The prediction results are:
Model accuracy: 0.9943333333333333
Most Informative Features
                     sad = True           Negati : Positi =     35.1 : 1.0
                follower = True           Positi : Negati =     20.6 : 1.0
                     bam = True           Positi : Negati =     20.1 : 1.0
                  arrive = True           Positi : Negati =     18.6 : 1.0
                     x15 = True           Negati : Positi =     17.3 : 1.0
                    blog = True           Positi : Negati =     16.7 : 1.0
                followed = True           Negati : Positi =     15.5 : 1.0
                    damn = True           Negati : Positi =     15.4 : 1.0
                     top = True           Positi : Negati =     15.3 : 1.0
              appreciate = True           Positi : Negati =     13.9 : 1.0
None
Text: I was sad on the day you went away,I'm not the man your heart is missing,that's why you go away I know. Prediction: Negative
Text: My heart is being cut by the knife that is called MISSING YOU. NOthing in the world can destroy me except losing you. My memory of you devours every cell of my blood Prediction: Negative
Text: I will always be there for you. Prediction: Negative
Text: I fuck you fuck your mother fuck your father fuck your family Prediction: Negative
Text: Don't worry when you are not recognized, but strive to be worthy of recognition. Prediction: Positive
Text: The power of imagination makes us infinite. Prediction: Negative
Text: The glow of one warm thought is to me worth more than money. Prediction: Positive
Because the training data is limited, the predictions are not always completely accurate.
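If you want a confidence score alongside each label, NaiveBayesClassifier also exposes prob_classify. A small sketch building on the same model and cleaned_list_func as above (test_with_prob is a hypothetical helper name, not part of the original code):
from nltk.tokenize import word_tokenize

def test_with_prob(model, test_text):
    # Same preprocessing as test(), but also report the probability of the chosen label
    custom_tokens = cleaned_list_func(word_tokenize(test_text))
    result = dict([token, True] for token in custom_tokens)
    dist = model.prob_classify(result)   # probability distribution over the two labels
    label = dist.max()                   # most likely label, same as model.classify(result)
    print('Text: {} Prediction: {} ({:.2%})'.format(test_text, label, dist.prob(label)))

test_with_prob(model, "I will always be there for you.")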
X. The Full Code
import random
import re
import string

from nltk.corpus import twitter_samples
from nltk.tag import pos_tag, pos_tag_sents
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import FreqDist


def fenci(file):
    # twitter_samples.tokenized returns one token list per tweet
    return twitter_samples.tokenized(file)


def cleaned_list_func(evert_tweet):
    # Data preprocessing: POS tagging, noise removal and lemmatization
    new_text = []
    cixing_list = pos_tag(evert_tweet)
    for word, cixing in cixing_list:
        word = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|(?:[0-9a-fA-F][0-9a-fA-F]))+', '', word)
        word = re.sub('(@[A-Za-z0-9_]+)', '', word)
        if cixing.startswith('NN'):
            pos = 'n'
        elif cixing.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'
        lemmatizer = WordNetLemmatizer()
        new_word = lemmatizer.lemmatize(word, pos)
        if len(new_word) > 0 and new_word not in string.punctuation and new_word.lower() not in stopwords.words('english'):
            new_text.append(new_word.lower())
    return new_text


def get_all_words(clean_tokens_list):
    # Optional helper: yields every token, e.g. for building a FreqDist of the vocabulary
    for tokens in clean_tokens_list:
        for token in tokens:
            yield token


def get_tweets_for_model(clean_tokens_list, tag):
    # Turn each tweet into ({token: True, ...}, label)
    li = []
    for every_tweet in clean_tokens_list:
        data_dict = dict([token, True] for token in every_tweet)
        li.append((data_dict, tag))
    return li


def train_model(train_data, test_data):
    from nltk import classify
    from nltk import NaiveBayesClassifier
    model = NaiveBayesClassifier.train(train_data)
    print('Model accuracy: {}'.format(classify.accuracy(model, test_data)))
    print(model.show_most_informative_features(10))
    return model


def test(model, test_text):
    from nltk.tokenize import word_tokenize
    custom_tokens = cleaned_list_func(word_tokenize(test_text))
    result = dict([token, True] for token in custom_tokens)
    yuce_res = model.classify(result)
    print('Text: {} Prediction: {}'.format(test_text, yuce_res))


if __name__ == '__main__':
    po_file_path = 'positive_tweets.json'
    ne_file_path = 'negative_tweets.json'
    positive_tweets = twitter_samples.strings(po_file_path)
    negative_tweets = twitter_samples.strings(ne_file_path)
    po_fenci_res = fenci(po_file_path)
    be_fenci_res = fenci(ne_file_path)
    positive_cleaned_list = []
    negative_cleaned_list = []
    for i in po_fenci_res:
        positive_cleaned = cleaned_list_func(i)
        positive_cleaned_list.append(positive_cleaned)
    for j in be_fenci_res:
        negative_cleaned = cleaned_list_func(j)
        negative_cleaned_list.append(negative_cleaned)
    po_for_model = get_tweets_for_model(positive_cleaned_list, 'Positive')
    ne_for_model = get_tweets_for_model(negative_cleaned_list, 'Negative')
    model_data = po_for_model + ne_for_model
    random.shuffle(model_data)
    train_data = model_data[:7000]
    test_data = model_data[7000:]
    model = train_model(train_data, test_data)
    test_list = [
        "I was sad on the day you went away,I'm not the man your heart is missing,that's why you go away I know.",
        "My heart is being cut by the knife that is called MISSING YOU. NOthing in the world can destroy me except losing you. My memory of you devours every cell of my blood",
        "I will always be there for you.",
        'I fuck you fuck your mother fuck your father fuck your family',
        "Don't worry when you are not recognized, but strive to be worthy of recognition.",
        "The power of imagination makes us infinite.",
        "The glow of one warm thought is to me worth more than money."
    ]
    for i in test_list:
        test(model, i)
The above is a simple, beginner-level analysis; any questions are welcome in the comments.