引子
之前加了一個 OpenCV 的 QQ 群,老是有消息提示,也不知道他們天天都在聊啥。於是乎,就想着用 Python 分析一下,看看他們都聊了什麼主題。
實施步驟
說干就干,花了大概一小時,手動把聊天記錄粘出來(無奈qq不提供文本導出方式,即便導出bak文件,也都是加密的);
緊接着把聊天記錄規范化:
- 由於圖片粘不出來,有圖片的地方都是空,好在不影響文本分析;
- 然後刪除了一些廣告消息,例如中科院培訓之類的;
最后規范之后格式為:
[ 日期,時間,用戶號, 聊天內容 ]
用戶號就是qq號,或者郵箱。
聊天內容為list,每個元素為本次發言打的一行字,可能有多行
這么存儲的目的是分析如下內容:
- 哪些用戶發言多,發言的頻率是多少,時間曲線如何
- 看看他們都討論了什么內容--本次着重分析這個,采用@Font Tian大神寫的詞雲包
核心代碼
聊天內容提取
"""
Author: deepinwst
Email: wanshitao@donews.com
Date: 18-12-18 下午12:25
"""
import re
from copy import deepcopy
# Hand-exported chat log; the excerpt covers 2018-05-10 through 2018-12-18.
FP = "OpenCv深度學習.txt"
# A line that consists solely of a date, e.g. "2018-05-10".
date_re = re.compile(r"^\d{4}-\d{2}-\d{2}$")
# A clock time at the end of a line, e.g. "9:01:02".
time_re = re.compile(r"\d{1,2}:\d{2}:\d{2}$")
# User id: either digits in parentheses or a mail address in angle brackets.
name_re_num = re.compile(r"\((\d+)\)")
name_re_mail = re.compile(r"\<(.*?@.*?)\>")
# Speaker header: a whitespace-free name, one whitespace character, a time.
person_re = re.compile(r"^\S+?\s\d{1,2}:\d{2}:\d{2}$")


def read_file(fp, encoding=None):
    """Parse a chat log into a list of message records.

    The log is read line by line.  A date-only line sets the date for the
    messages that follow, a speaker header line starts a new message, and
    every other line is appended to the message currently being collected.

    Args:
        fp: Path of the chat log to read.
        encoding: Text encoding passed to ``open``; ``None`` keeps the
            platform default.

    Returns:
        A list of ``[date, time, user, lines]`` records in file order, where
        ``lines`` is the list of stripped text lines of that message.  Lines
        that are not preceded by a speaker header (at the top of the file or
        right after a date line) are ignored.
    """
    text = []
    current_date = '2018-12-18'  # used until the first date line is seen
    record = None  # message being collected, or None between messages

    with open(fp, encoding=encoding) as content:
        for line in content:
            sl = line.strip()

            if date_re.match(sl):
                # A date separator also terminates the message in progress.
                current_date = sl
                if record is not None:
                    text.append(record)
                    record = None
                continue

            if person_re.match(sl):
                if record is not None:
                    text.append(record)
                stamp = time_re.findall(sl)[0]
                ids = name_re_num.findall(sl) or name_re_mail.findall(sl)
                # Fall back to the raw name when the header carries neither
                # a numeric id nor a mail address.
                user = ids[0] if ids else sl[:-len(stamp)].strip()
                record = [current_date, stamp, user, []]
            elif record is not None:
                record[3].append(sl)

    # The last message has no following header to flush it.
    if record is not None:
        text.append(record)
    return text
if __name__ == "__main__":
    # Dump every parsed message as one space-joined line of text.
    print("start ... ")
    records = read_file(FP)
    with open("meterials.txt", "w") as out:
        for record in records:
            lines = record[3]
            print(lines)
            out.write(' '.join(lines) + "\n")
            # NOTE(review): separator assumed to be printed once per
            # message; the original indentation was lost - confirm.
            print("*"*80)
詞雲生成
from os import path, getcwd

import matplotlib.pyplot as plt
try:
    from scipy.misc import imread
except ImportError:
    # scipy.misc.imread is gone from recent SciPy releases; matplotlib's
    # imread also returns the image as an array.
    from matplotlib.pyplot import imread
from wordcloud import WordCloud, ImageColorGenerator
# All resources are resolved relative to the current working directory.
d = getcwd()
stopwords_path = d + '/wc_cn/stopwords_cn_en.txt'
# A font with Chinese glyphs must be set for WordCloud.
font_path = d + '/fonts/SourceHanSerif/SourceHanSerifK-Light.otf'
# Paths the word clouds are saved to.
imgname1 = d + '/wc_cn/LuXun.jpg'
imgname2 = d + '/wc_cn/LuXun_colored.jpg'
# Read the image used both as mask and as colour source.
# (The path is already rooted at d, so no extra path.join is needed.)
temp = d + '/wc_cn/LuXun_color.jpg'
back_coloring = imread(temp)
# Read the whole text; the context manager closes the file afterwards.
with open("meterials.txt") as f_text:
    text = f_text.read()
# Custom words registered with jieba before segmentation.
userdict_list = ['灰度值', '掩膜', '支持向量機', '人臉檢測']
isUseJieba = True
# HanLP option: stop-word filtering can be enabled or disabled.
# NOTE(review): this flag is not read anywhere in the visible code.
isUseStopwordsByHanLP = False
# The function for processing text with Jieba
def jieba_processing_txt(text):
    """Segment *text* with jieba and drop stop words and short tokens.

    Every entry of ``userdict_list`` is registered with ``jieba.add_word``
    first.  The text is then cut with ``cut_all=False``; a token is kept
    when, after stripping whitespace, it is longer than one character and
    is not listed in the stop-word file at ``stopwords_path`` (one word
    per line, UTF-8).

    Args:
        text: The raw text to segment.

    Returns:
        The kept tokens joined by single spaces.
    """
    for word in userdict_list:
        jieba.add_word(word)

    # A set gives constant-time membership tests for every token.
    with open(stopwords_path, encoding='utf-8') as f_stop:
        stopwords = set(f_stop.read().splitlines())

    kept = []
    for token in jieba.cut(text, cut_all=False):
        word = token.strip()
        if len(word) > 1 and word not in stopwords:
            kept.append(word)
    return ' '.join(kept)
# Without jieba the raw text is handed to WordCloud unchanged, so that
# generate() never receives an empty string.
result_text = text
if isUseJieba:
    import jieba
    # NOTE(review): enable_parallel appears to create its worker processes
    # immediately, so the custom words are registered before it is called;
    # words added afterwards would presumably not reach the workers.
    for custom_word in userdict_list:
        jieba.add_word(custom_word)
    try:
        # Use 4 parallel processes; not available on Windows.
        jieba.enable_parallel(4)
    except NotImplementedError:
        # Parallel mode is unsupported here; segmentation still works
        # in a single process.
        pass
    # A user dictionary file could be loaded with jieba.load_userdict()
    # instead of the add_word() calls.
    result_text = jieba_processing_txt(text)

wc = WordCloud(font_path=font_path, background_color="white", max_words=2000, mask=back_coloring,
               max_font_size=100, random_state=42, width=1000, height=860, margin=2, )
wc.generate(result_text)

# Colour generator built from the image (not used further in this script).
image_colors_default = ImageColorGenerator(back_coloring)

# Show the word cloud in its default colours.
plt.figure()
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.show()

# Save the default-coloured word cloud.
wc.to_file(path.join(d, imgname1))

# Create the colouring from the image.
image_colors_byImg = ImageColorGenerator(back_coloring)

# Recolour and show; color_func=image_colors could also be given
# directly in the WordCloud constructor.
plt.imshow(wc.recolor(color_func=image_colors_byImg), interpolation="bilinear")
plt.axis("off")
plt.figure()
plt.imshow(back_coloring, interpolation="bilinear")
plt.axis("off")
plt.show()

# Save the recoloured word cloud.
wc.to_file(path.join(d, imgname2))
結果截圖:
結果分析:
- 由於是圖像處理的群,出現很多詞“圖像”、“圖片”、“opencv”
- 經常有新手問問題,“大神”、“大佬”出現了很多次
如果需要完整的源碼,請關注此公眾號,給我留言(qq聊天記錄分析);