本預選賽要求選手建立文本情感分類模型，選手用訓練好的模型對測試集中的文本情感進行預測，判斷其情感為「Negative」或者「Positive」。所提交的結果按照指定的評價指標使用在線評測數據進行評測，達到或超過規定的分數線即通過預選賽。

二、比賽數據

訓練集數據（6328個樣本）

測試集數據（2712個樣本）

評價方法：AUC

三、分析

1、加載模塊

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import TfidfTransformer,TfidfVectorizer

2、數據讀取

# Load the labelled training split; it is parsed with pandas' slower but more
# tolerant pure-Python CSV engine.
train_data = pd.read_csv('./data/train.csv', engine='python')
# Load the test split that has to be scored for the submission.
test_data = pd.read_csv('./data/20190520_test.csv')

3、數據預處理

# Print the column summary. Author's note: it shows 5 missing values.
# NOTE(review): this inspects test_data, but the rows dropped below come from
# train_data -- confirm whether train_data.info() was intended.
test_data.info()

# Missing values: discard training rows that have no label.
train_data = train_data.dropna(subset=['label'])

# Encode the label column numerically: 'Positive' -> 0, 'Negative' -> 1.
train_data['label'] = train_data['label'].replace({'Positive': 0, 'Negative': 1})

###評論數據處理

    1、去除一些【標點符號、數字、特殊符號、等】
    2、分詞，去除句子首尾的空格(strip)，單詞小寫化（lower）
    3、去除一些【短詞】和【停用詞】，大多數太短的詞起不到什么作用，比如‘pdx’，‘his’，‘all’。
    4、提取詞干，將不同但同義的詞轉化成相同的詞，如loves，loving，lovable變成love

###評論數據處理
def filter_fun(line):
    """Return ``line`` lower-cased, with every non-ASCII-letter character
    (punctuation, digits, symbols, ...) replaced by a single space.

    Whitespace is not collapsed: each removed character leaves one space.
    """
    return re.sub(r'[^a-zA-Z]', ' ', line).lower()

# Normalise the review text of both splits with one shared pipeline, so the
# train and test columns are guaranteed to be processed identically.
# Import only the stemmer instead of a wildcard import.
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()


def _normalize_review(text):
    """Clean one raw review and return it as a space-joined string of stems.

    Steps, in the same order as the original column-by-column passes:
      1. ``filter_fun``: keep letters only and lower-case;
      2. split on whitespace (this also drops leading/trailing blanks);
      3. drop short words (three characters or fewer);
      4. Porter-stem each remaining word, so that e.g. play / played /
         playing share one token;
      5. join the stems back with single spaces.
    """
    words = filter_fun(text).split()
    return " ".join(stemmer.stem(word) for word in words if len(word) > 3)


train_data['review'] = train_data['review'].apply(_normalize_review)
test_data['review'] = test_data['review'].apply(_normalize_review)

########################以下部分可以不處理##################

3、數據分析

在這次比賽中數據分析沒起什么作用，因為評論做了脫敏處理。

####################################
    1、數據集中最常見的單詞有哪些？【可采用詞雲】
    2、數據集上表述積極和消極的常見詞匯有哪些？【可采用詞雲】
    3、評論一般有多少主題標簽？
    4、我的數據集跟哪些趨勢相關？
    5、哪些趨勢跟情緒相關？他們和情緒是吻合的嗎？
　　 6、詞長與頻次的關系【畫柱狀圖，此次代碼中平均詞長為15】

# Word clouds of the most frequent words in the reviews.
# The original referenced an undefined frame named `combi`; the cleaned
# training frame is used instead because it is the one that carries `label`.
from wordcloud import WordCloud


def _show_wordcloud(text):
    """Draw ``text`` as a word cloud, display it, and return the WordCloud."""
    cloud = WordCloud(width=800, height=500, random_state=21,
                      max_font_size=110).generate(text)
    plt.figure(figsize=(10, 7))
    plt.imshow(cloud, interpolation="bilinear")
    plt.axis('off')
    plt.show()
    return cloud


# All training reviews
all_words = ' '.join(train_data['review'])
wordcloud = _show_wordcloud(all_words)

# Positive reviews (label == 0)
positive_words = ' '.join(train_data.loc[train_data['label'] == 0, 'review'])
wordcloud = _show_wordcloud(positive_words)

# Negative reviews (label == 1)
negative_words = ' '.join(train_data.loc[train_data['label'] == 1, 'review'])
wordcloud = _show_wordcloud(negative_words)

def hashtag_extract(x):
    """Return, for each string in ``x``, the list of hashtags it contains.

    A hashtag is the run of word characters following a ``#``; the ``#``
    itself is not included. The result has one (possibly empty) list per
    input item, in input order.
    """
    return [re.findall(r"#(\w+)", text) for text in x]


# Collect hashtags separately for positive (label == 0) and negative
# (label == 1) reviews. The original referenced an undefined `combi` frame;
# the labelled training frame is used instead.
# NOTE: filter_fun replaces every non-letter character, including '#', so on
# already-cleaned reviews these lists are empty; run this on raw text if the
# hashtag statistics are needed.
from itertools import chain

HT_positive = hashtag_extract(train_data.loc[train_data['label'] == 0, 'review'])
HT_negative = hashtag_extract(train_data.loc[train_data['label'] == 1, 'review'])

# Flatten the per-review lists in linear time (sum(lists, []) is quadratic).
HT_positive = list(chain.from_iterable(HT_positive))
HT_negative = list(chain.from_iterable(HT_negative))

# Bar charts of the ten most frequent hashtags per sentiment.
# `nltk` and `sns` were never imported in this file, so the counting is done
# with collections.Counter and the bars are drawn with matplotlib, which is
# already imported as `plt`.
from collections import Counter


def _plot_top_hashtags(counts, n=10):
    """Plot the ``n`` most frequent hashtags in ``counts`` and return them.

    ``counts`` maps hashtag -> number of occurrences. The returned DataFrame
    has the columns 'Hashtag' and 'Count', holding the rows that were drawn.
    """
    table = pd.DataFrame({'Hashtag': list(counts.keys()),
                          'Count': list(counts.values())})
    top = table.nlargest(columns="Count", n=n)
    plt.figure(figsize=(16, 5))
    ax = plt.gca()
    ax.bar(top['Hashtag'], top['Count'])
    ax.set(xlabel='Hashtag', ylabel='Count')
    plt.show()
    return top


# Positive hashtags
a = Counter(HT_positive)
d = _plot_top_hashtags(a)

# Negative hashtags. The original line `hashtagse = e.nlargest(...)` was a
# garbled `e = e.nlargest(...)`, so the plot was never cut to the top 10.
b = Counter(HT_negative)
e = _plot_top_hashtags(b)

##數據分析
def split_word(s):
    """Return the number of whitespace-separated tokens in the string ``s``."""
    tokens = s.split()
    return len(tokens)
# Number of tokens in every training review.
train_data['word_length'] = train_data['review'].apply(split_word)

# Summary statistics over the per-review token counts.
word_lengths = train_data['word_length']
sum_len = sum(word_lengths)
nums = len(word_lengths)
maxnum = max(word_lengths)
minnum = min(word_lengths)
# The mean is reported with integer (floor) division.
print("all words number: {0} , mean word length : {1} ,max word length: {2} and min: {3} ".format(sum_len , sum_len // nums , maxnum,minnum))


# Histogram: review length (in tokens) against frequency.
plt.hist(word_lengths, bins=150)
plt.xlabel('length')
plt.ylabel('frequency')
# Fix the visible window to lengths 0-100 and counts 0-800.
plt.axis([0, 100, 0, 800])
plt.show()

#######################################################3

4、模型

　　訓練模型----兩部分（文本特征提取、文本分類）

    1、文本特征提取：詞袋模型、TF_IDF、word_embbeding
    2、文本分類：邏輯回歸、SVM、貝葉斯、LSTM、textCNN等

    【在這次預選賽中，效果最好的是 TF_IDF + 貝葉斯 ----0.86】
    【試了 詞袋模型 + LR 和 TF_IDF + LR（這兩種效果最差）、詞袋模型 + 貝葉斯 （效果一般）----0.84、TF_IDF + 貝葉斯（效果最好）】

（1）文本特征提取：

　　①詞袋模型

# Build bag-of-words features.
from sklearn.feature_extraction.text import CountVectorizer

# Ignore terms that occur in more than 30% of the documents, keep at most
# 8200 features, and drop scikit-learn's built-in English stop words.
bow_vectorizer = CountVectorizer(max_df=0.30, max_features=8200, stop_words='english')

# Learn the vocabulary on the training reviews only ...
X_train = bow_vectorizer.fit_transform(train_data['review'])
# ... and reuse it for the test reviews. Calling fit_transform here (as the
# original did) would learn a second, different vocabulary, so the test
# columns would no longer line up with the features the model is trained on.
X_test = bow_vectorizer.transform(test_data['review'])
print(test_data.describe())
print(X_train.toarray())

　　②TF-IDF模型

# TF-IDF features over unigrams and bigrams.
ngram = 2
vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    ngram_range=(1, ngram),
    max_df=0.5,
)

# The vectorizer is fitted on the training and test reviews together, so
# remember how many leading rows belong to the training set.
lentrain = len(train_data)
X_all = train_data['review'].tolist() + test_data['review'].tolist()

# Fit and transform in one step.
X_all = vectorizer.fit_transform(X_all)

# Split the stacked matrix back into its training and test parts.
X_train = X_all[:lentrain]
X_test = X_all[lentrain:]

（2）文本分類模型

　　①邏輯回歸

# Logistic-regression model.
# roc_curve and auc are imported here because the file's own import of them
# appears only further down, after this section has already used them.
from sklearn.metrics import auc, roc_curve

# Hold out 20% of the labelled data for validation. X_train is whichever
# feature matrix was built last in the feature-extraction section.
X_train_part, X_test_part, y_train_part, y_test_part = train_test_split(
    X_train, train_data['label'], test_size=0.2)

lreg = LogisticRegression()
lreg.fit(X_train_part, y_train_part)

# Column 1 of predict_proba is the probability of the class encoded as 1.
prediction = lreg.predict_proba(X_test_part)

# F1 needs hard labels: predict class 1 when its probability is >= 0.3.
# (The removed alias np.int is replaced by the builtin int.)
prediction_int = (prediction[:, 1] >= 0.3).astype(int)
print("回歸f", f1_score(y_test_part, prediction_int))  # F1 on the validation part

# AUC is a ranking metric, so it is computed from the probabilities and not
# from the thresholded 0/1 predictions.
fpr, tpr, thresholds = roc_curve(y_test_part, prediction[:, 1])
print('回歸auc', auc(fpr, tpr))

# Class probabilities for the real test set.
test_pred = lreg.predict_proba(X_test)

print("這里P:", test_pred)

保存測試結果

# Sanity checks before writing: compare the element counts of the raw
# probability matrix, the extracted column, and the test IDs.
print(test_pred.size)
# Keep only column 1 of the predicted probabilities.
test_pred_int = test_pred[:, 1]
print(test_pred_int.size)
print(pd.DataFrame(test_data, columns=["ID"]).size)

# Attach the scores to the test frame and write the two submission columns.
test_data['Pred'] = test_pred_int
submission = test_data[['ID', 'Pred']]
submission.to_csv('./result.csv', index=False)

　　②貝葉斯模型

from sklearn.model_selection import  train_test_split,KFold
#from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB,BernoulliNB,MultinomialNB
from sklearn.metrics import roc_auc_score,auc,roc_curve
#from sklearn.svm import SVC
#import xgboost as xgb

# Hold out 20% of the labelled data for validation.
X_train_part, X_test_part, y_train_part, y_test_part = train_test_split(
    X_train, train_data['label'], test_size=0.2)

# Multinomial naive Bayes fitted on the training part.
clf = MultinomialNB()
clf.fit(X_train_part, y_train_part)

# AUC on the data the model was fitted on (author's run: 0.9992496306904572).
y_pred = clf.predict_proba(X_train_part)
fpr, tpr, thresholds = roc_curve(y_train_part, y_pred[:, 1])
auc(fpr, tpr)

# AUC on the held-out part (author's run: 0.8613719824212871).
y_pred = clf.predict_proba(X_test_part)
fpr, tpr, thresholds = roc_curve(y_test_part, y_pred[:, 1])
auc(fpr, tpr)

# Refit naive Bayes on the complete labelled set and score the real test set.
clf = MultinomialNB()
clf.fit(X_train, train_data['label'])
y_pred_text = clf.predict_proba(X_test)

# Save the submission: test IDs plus column 1 of the predicted probabilities.
# The original referenced an undefined `test`; the test frame in this file
# is `test_data`.
submit = pd.DataFrame()
submit['ID'] = test_data['ID']
submit['Pred'] = y_pred_text[:, 1]
submit.to_csv('submit_bayes_2.csv', index=False)

參考：https://blog.csdn.net/Strawberry_595/article/details/90205761

kesci---2019大數據挑戰賽預選賽---情感分析

一、預選賽題------文本情感分類模型