1.jieba分詞與詞性標注
思路:
(1)利用pandas讀取csv文件中的酒店客戶評論,並創建3個新列用來存放分詞結果、詞性標注結果、分詞+詞性標注結果
(2)利用jieba分詞工具的posseg包,同時實現分詞與詞性標注
(3)利用停用詞表對分詞結果進行過濾
(4)將分詞結果以20000條為單位寫入txt文檔中,便於後續的詞頻統計以及詞雲的製作
(5)將最終的分詞結果與詞性標注結果存儲到csv文件中
# coding:utf-8
"""Segment hotel reviews with jieba and tag every token's part of speech.

The script loads the review CSV, runs jieba's ``posseg`` tokenizer on each
review, filters out stop words, appends each review's tokens to txt files
holding a fixed number of reviews each, and finally writes the table with the
three new result columns back to CSV.
"""
import os
import time

import jieba
import jieba.posseg as pseg
import pandas as pd
from jieba.analyse import *  # noqa: F401,F403  (kept from the original script)

# Number of reviews appended to each split_txt_<n>.txt file.
REVIEWS_PER_TXT = 20000

# The CSV is read without a header row; column names are assigned afterwards.
df = pd.read_csv('csvfiles/hotelreviews_after_filter_utf.csv', header=None)
columns_name = ['mysql_id', 'hotelname', 'customername', 'reviewtime',
                'checktime', 'reviews', 'scores', 'type', 'room', 'useful',
                'likenumber']
df.columns = columns_name
df['review_split'] = 'new'      # segmentation result
df['review_pos'] = 'new'        # part-of-speech tags
df['review_split_pos'] = 'new'  # "word/tag" pairs


def jieba_cut(review):
    """Segment *review* with jieba.posseg and return a ``{word: tag}`` dict.

    NOTE(review): building a dict collapses repeated words (the last tag seen
    wins), so the per-review output holds each distinct word once. Confirm
    this is intended before using the output for raw term frequencies.
    """
    review_dict = dict(pseg.cut(review))
    return review_dict


def stopwordslist(stopwords_path):
    """Return the stop words in *stopwords_path*, one stripped entry per line."""
    with open(stopwords_path, encoding='UTF-8') as stopwords_file:
        return [line.strip() for line in stopwords_file]


def get_fenciresult_cixin(review_dict_afterfilter):
    """Format a ``{word: tag}`` dict as three strings.

    Returns:
        A tuple ``(review_split, review_pos, review_split_pos)``: the words
        joined by "/", the tags joined by "/", and "word/tag" pairs joined
        by ",".
    """
    review_split = "/".join(review_dict_afterfilter.keys())
    review_pos = "/".join(review_dict_afterfilter.values())
    review_split_pos = ",".join(
        f"{word}/{tag}" for word, tag in review_dict_afterfilter.items()
    )
    return review_split, review_pos, review_split_pos


# Kept as a set under its own name: the original rebound ``stopwordslist`` to
# the loaded list (hiding the function) and tested membership against a list.
STOPWORDS = set(stopwordslist("stopwords_txt/total_stopwords_after_filter.txt"))


def fenci_and_pos(review):
    """Segment *review*, tag parts of speech and drop stop words.

    A non-string *review* (for example the NaN pandas yields for an empty
    cell) is treated as an empty review instead of being passed to jieba.

    Returns:
        The ``(review_split, review_pos, review_split_pos)`` strings produced
        by ``get_fenciresult_cixin``.
    """
    if not isinstance(review, str):
        review = ""
    # 01 segmentation + POS tagging in one pass
    review_dict = jieba_cut(review)
    # 02 stop-word filtering
    review_dict_afterfilter = {
        word: tag for word, tag in review_dict.items() if word not in STOPWORDS
    }
    # 03 format the three result strings
    return get_fenciresult_cixin(review_dict_afterfilter)


def fenci_pos_time(start_time, end_time):
    """Split the elapsed seconds between two timestamps into (minutes, seconds)."""
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs


# Optional: load a user-defined dictionary before segmenting, e.g.
# jieba.load_userdict('stopwords_txt/user_dict.txt')

start_time = time.time()
os.makedirs('split_result_txt', exist_ok=True)

split_results = []
pos_results = []
split_pos_results = []
for review_count, (review_mysql_id, reviews) in enumerate(
        zip(df['mysql_id'], df['reviews'])):
    review_split, review_pos, review_split_pos = fenci_and_pos(reviews)
    print(review_mysql_id)  # progress: ID of the review just processed
    split_results.append(review_split)
    pos_results.append(review_pos)
    split_pos_results.append(review_split_pos)

    # Derive the file number from the running count so that every file holds
    # exactly REVIEWS_PER_TXT reviews (the original counter skipped one row at
    # each rollover, giving 20001 rows in every file after the first).
    txt_id = review_count // REVIEWS_PER_TXT + 1
    review_split_txt_path = 'split_result_txt/split_txt_' + str(txt_id) + '.txt'
    with open(review_split_txt_path, 'a', encoding='utf-8') as f:
        f.write('\n' + review_split)

# Assign whole columns at once instead of one df.loc write per row.
df['review_split'] = split_results
df['review_pos'] = pos_results
df['review_split_pos'] = split_pos_results

# header=False: do not write the column names into the CSV.
df.to_csv('csvfiles/hotelreviews_fenci_pos.csv', header=False, index=False)

# Report how long segmentation and tagging took.
end_time = time.time()
fenci_mins, fenci_secs = fenci_pos_time(start_time, end_time)
print(f'Fenci Time: {fenci_mins}m {fenci_secs}s')
print("hotelreviews_fenci_pos.csv文件分詞與詞性標注已完成")
2.詞頻統計
from collections import Counter


def wordfreqcount(review_split_txt_path):
    """Count word frequencies in a segmentation result file.

    Each line of the file is expected to hold one review's words separated
    by "/". Line terminators are removed before splitting and empty tokens
    (from blank lines) are ignored, so the last word of a line is counted
    together with the same word elsewhere.

    Args:
        review_split_txt_path: path of the UTF-8 txt file to read.

    Returns:
        A list of ``(word, count)`` tuples sorted by count, highest first;
        words with equal counts keep the order of first appearance.
    """
    wordfreq = Counter()
    with open(review_split_txt_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Counter increments on every occurrence, so a word repeated
            # within one line is counted each time.
            wordfreq.update(
                word for word in line.rstrip("\r\n").split("/") if word
            )
    return wordfreq.most_common()


if __name__ == "__main__":
    # Path of the txt file holding the segmentation results.
    txt_id = 1
    review_split_txt_path = 'split_result_txt/split_txt_' + str(txt_id) + '.txt'
    word_freq_list = wordfreqcount(review_split_txt_path)
    # Print the 10 most frequent words (fewer if the file has fewer words).
    for word_freq in word_freq_list[:10]:
        print(word_freq)
3.詞雲制作
首先利用conda安裝wordcloud
conda install -c conda-forge wordcloud
最簡單的入門案例:
import wordcloud

# Canvas size, background colour and font used for the word cloud image.
cloud_options = dict(
    width=1000,
    height=700,
    background_color='white',
    font_path='msyh.ttc',
)
# Text the word cloud is generated from.
poem_text = '從明天起,做一個幸福的人。喂馬、劈柴,周游世界。從明天起,關心糧食和蔬菜。我有一所房子,面朝大海,春暖花開'

# Build the word cloud object, feed it the text, then save the picture as
# output2-poem.png in the current folder.
w = wordcloud.WordCloud(**cloud_options)
w.generate(poem_text)
w.to_file('output2-poem.png')
效果圖:
我的詞雲案例:
import jieba
import wordcloud
import imageio

# Read a local picture with imageio; it is passed to WordCloud as the mask
# that gives the word cloud its shape.
mk = imageio.imread("pic/qiqiu2.png")

# Build and configure the word cloud object.
w = wordcloud.WordCloud(
    max_words=200,  # maximum number of words shown in the cloud
    background_color='white',
    mask=mk,
    # NOTE(review): the original author remarked that this font file is not
    # in the project folder and the setting is probably ineffective - confirm.
    font_path='msyh.ttc',
)

# Path of the txt file holding the segmentation results.
txt_id = 1
review_split_txt_path = 'split_result_txt/split_txt_' + str(txt_id) + '.txt'
# Read the whole file in one call; the context manager closes the handle,
# which the original left open.
with open(review_split_txt_path, 'r', encoding='utf-8') as f:
    string = f.read()
print(string)

# Feed the text to the word cloud and export the picture to the current folder.
w.generate(string)
w.to_file('output5-tongji.png')
效果圖:
參考文獻:https://www.cnblogs.com/wkfvawl/p/11585986.html
4.TF-IDF 關鍵詞提取
import jieba
# ``import jieba`` alone does not load the ``analyse`` subpackage; import it
# explicitly so ``jieba.analyse.extract_tags`` is available when this section
# runs on its own.
import jieba.analyse

# Path of the txt file holding the segmentation results.
txt_id = 1
review_split_txt_path = 'split_result_txt/split_txt_' + str(txt_id) + '.txt'
# Read the whole file in one call; the context manager closes the handle,
# which the original left open.
with open(review_split_txt_path, 'r', encoding='utf-8') as f:
    review_split = f.read()
print("review_split:" + review_split)

# Extract the 10 highest-weighted keywords together with their weights,
# using jieba's default IDF file.
keywords = jieba.analyse.extract_tags(review_split, topK=10, withWeight=True)
print('【TF-IDF提取的關鍵詞列表:】')
print(keywords)
參考文獻:https://blog.csdn.net/asialee_bird/article/details/81486700 TF-IDF算法介紹及實現