引子

之前加了个OpenCV的QQ群，老是有消息提示，也不知道他们天天都在聊啥。于是乎，就想着用Python分析一下，看看他们都聊了什么主题。

实施步骤

说干就干，花了大概一小时，手动把聊天记录粘出来（无奈qq不提供文本导出方式，即便导出bak文件，也都是加密的）；

紧接着把聊天记录规范化：

由于图片粘不出来，有图片的地方都是空，好在不影响文本分析；
然后删除了一些打广告的消息——比如中科院培训之类的；

最后规范之后格式为：

[ 日期，时间，用户号， 聊天内容 ]

用户号就是qq号，或者邮箱。
聊天内容为list，每个元素为本次发言打的一行字，可能有多行

这么存储的目的是分析如下内容：

哪些用户发言多，发言的频率是多少，时间曲线如何
看看他们都讨论了什么内容--本次着重分析这个，采用@Font Tian大神写的词云包

核心代码

聊天内容提取

"""
Author: deepinwst
Email: wanshitao@donews.com
Date: 18-12-18 下午12:25
"""


import re
from copy import deepcopy


# The captured chat log spans 2018-05-10 to 2018-12-18.
FP = "OpenCv深度学习.txt"

# A line that consists solely of a date (YYYY-MM-DD).
date_re = re.compile(r"^\d{4}-\d{2}-\d{2}$")
# The time of day (H:MM:SS or HH:MM:SS) at the end of a line.
time_re = re.compile(r"\d{1,2}:\d{2}:\d{2}$")
# The user id: digits in parentheses (QQ number) or an e-mail address in
# angle brackets.
name_re_num = re.compile(r"\((\d+)\)")
name_re_mail = re.compile(r"\<(.*?@.*?)\>")
# A speaker header: one whitespace-free token, one whitespace character,
# then the time of day.
person_re = re.compile(r"^\S+?\s\d{1,2}:\d{2}:\d{2}$")


def read_file(fp, encoding="utf-8"):
    """Parse a pasted chat log into a list of message records.

    The file is read line by line. A date line sets the date used for the
    messages that follow, a speaker header line starts a new message, and
    every other line is collected as content of the current message.

    Args:
        fp: Path of the chat-log text file to read.
        encoding: Text encoding used to decode the file.

    Returns:
        A list of ``[date, time, user, lines]`` records in file order.
        ``user`` is the id extracted from the header line and ``lines`` is
        the list of stripped content lines of that message. Content that
        appears before any header is collected in a record whose date,
        time and user are empty strings.

    Raises:
        OSError: If the file cannot be opened.
    """
    text = []
    # Message currently being accumulated; None while no message is open,
    # so that nothing empty or duplicated is ever appended to the result.
    record = None
    # Date assumed for messages that appear before the first date line.
    temp_date = '2018-12-18'

    with open(fp, encoding=encoding) as content:
        for line in content:
            sl = line.strip()

            # A date line closes the open message and switches the date.
            date_ = date_re.match(sl)
            if date_:
                temp_date = date_.group(0)
                if record is not None:
                    text.append(record)
                    record = None
                continue

            # A speaker header closes the open message and starts a new one.
            if person_re.match(sl):
                name = name_re_num.findall(sl) or name_re_mail.findall(sl)
                time_ = time_re.findall(sl)
                # Without a user id the line is not a real header (e.g. a
                # chat line that merely ends with a time); fall through and
                # keep it as ordinary content instead of failing.
                if name and time_:
                    if record is not None:
                        text.append(record)
                    record = [temp_date, time_[0], name[0], []]
                    continue

            if record is None:
                # Blank separator lines outside a message carry no content.
                if not sl:
                    continue
                record = ['', '', '', []]
            record[3].append(sl)

    # Flush the last message, which no later header or date line closes.
    if record is not None:
        text.append(record)
    return text


# Script entry point: parse the chat log and dump each message's content
# as one space-joined line of "meterials.txt".
if __name__ == "__main__":
    print("start ... ")
    records = read_file(FP)
    with open("meterials.txt", "w") as out:
        for record in records:
            body = record[3]
            # Echo the content lines of the message to the console.
            print(body)
            joined = ' '.join(body)
            out.write(joined + "\n")
    print("*" * 80)

词云生成

from os import path, getcwd
from scipy.misc import imread
import matplotlib.pyplot as plt
from wordcloud import WordCloud, ImageColorGenerator


# All resources are looked up relative to the current working directory.
d = getcwd()
stopwords_path = path.join(d, 'wc_cn', 'stopwords_cn_en.txt')
# A font that contains Chinese glyphs must be given to WordCloud.
font_path = path.join(d, 'fonts', 'SourceHanSerif', 'SourceHanSerifK-Light.otf')

# Output paths of the generated word clouds.
imgname1 = path.join(d, 'wc_cn', 'LuXun.jpg')
imgname2 = path.join(d, 'wc_cn', 'LuXun_colored.jpg')
# Image used both as the mask and as the colour source of the word cloud.
temp = path.join(d, 'wc_cn', 'LuXun_color.jpg')

# NOTE(review): scipy.misc.imread was removed in SciPy 1.2, so the import of
# `imread` fails on newer SciPy; consider imageio.imread or plt.imread.
back_coloring = imread(temp)

# Read the whole text; the context manager closes the file handle.
with open("meterials.txt") as text_file:
    text = text_file.read()

# Extra words added to jieba's dictionary (via jieba.add_word) before the
# text is segmented.
userdict_list = ['灰度值', '掩膜', '支持向量机', '人脸检测']

# Segment the text with jieba before building the word cloud.
isUseJieba = True

# Switch for stop-word handling through HanLP; not referenced by the code
# shown here.
isUseStopwordsByHanLP = False


# The function for processing text with Jieba
def jieba_processing_txt(text):
    """Segment *text* with jieba and drop stop words and short tokens.

    Every word of ``userdict_list`` is first added to jieba's dictionary.
    The text is then cut in accurate mode (``cut_all=False``); tokens that,
    once stripped, are a stop word or at most one character long are
    discarded.

    Args:
        text: The raw text to segment.

    Returns:
        The remaining words, stripped and joined by single spaces.

    Raises:
        OSError: If the stop-word file at ``stopwords_path`` cannot be read.
    """
    for word in userdict_list:
        jieba.add_word(word)

    with open(stopwords_path, encoding='utf-8') as f_stop:
        # A set makes the per-token stop-word lookup O(1) instead of a scan
        # over the whole stop-word list.
        stopwords = {entry.strip() for entry in f_stop.read().splitlines()}

    # Iterate over the tokens directly rather than joining them with "/ "
    # and splitting again, which left a leading space on the kept words.
    stripped = (token.strip() for token in jieba.cut(text, cut_all=False))
    mywordlist = [
        word for word in stripped
        if len(word) > 1 and word not in stopwords
    ]
    return ' '.join(mywordlist)


# Fall back to the unsegmented text so that wc.generate() still receives
# words when jieba is disabled; with an empty string there is nothing for
# the word cloud to plot.
result_text = text


if isUseJieba:
    import jieba

    try:
        # Segment with 4 parallel processes where jieba supports it.
        jieba.enable_parallel(4)
    except NotImplementedError:
        # jieba's parallel mode is POSIX-only (it is unable to run on
        # Windows); continue with single-process segmentation.
        pass
    # A user dictionary file could be loaded with jieba.load_userdict()
    # instead of the jieba.add_word() calls in jieba_processing_txt.
    result_text = jieba_processing_txt(text)


wc = WordCloud(font_path=font_path, background_color="white", max_words=2000, mask=back_coloring,
               max_font_size=100, random_state=42, width=1000, height=860, margin=2, )

wc.generate(result_text)

# Colour function that takes its colours from the mask image; built once
# and shared by both names.
image_colors_default = ImageColorGenerator(back_coloring)
image_colors_byImg = image_colors_default

# Show the word cloud with its default colours.
plt.figure()
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.show()

# Save the word cloud; imgname1 is already an absolute path below the
# working directory, so no further joining is needed.
wc.to_file(imgname1)

# Recolour the word cloud from the image and show it, followed by the
# source image in a second figure.
# We could also give color_func=image_colors directly in the constructor.
plt.imshow(wc.recolor(color_func=image_colors_byImg), interpolation="bilinear")
plt.axis("off")
plt.figure()
plt.imshow(back_coloring, interpolation="bilinear")
plt.axis("off")
plt.show()

# Save the word cloud again after the recolor() call above.
wc.to_file(imgname2)

结果截图：

结果分析：

由于是图像处理的群，“图像”、“图片”、“opencv”等词出现了很多次
经常有新手问问题，“大神”、“大佬”出现了很多次

如果需要完整的源码，请关注此公众号，给我留言（qq聊天记录分析）；

免责声明！

本站转载的文章为个人学习借鉴使用，本站对版权不负任何法律责任。如果侵犯了您的隐私权益，请联系本站邮箱yoyou2525@163.com删除。

猜您在找 用QQ聊天记录生成一个词云用python做些有意思的事——分析QQ聊天记录——私人订制用python做些有意思的事——分析QQ聊天记录导出QQ聊天记录并用python处理 QQ聊天记录的相关代码 Python 情人节超强技能导出微信聊天记录生成词云 QQ聊天记录快速迁移如何找回QQ聊天记录、语音、图片？手机qq 聊天记录同步到电脑qq上 Mac WIn7 QQ聊天记录互导聊天记录合并