Python大作業---微博爬蟲及簡單數據分析

本文轉載自查看原文 2019-07-17 04:34 957

剛開始學python，選了這個題目，把代碼放上來留念，沒有用到很流行的框架，所以代碼量挺大
GUI用wxpython寫的

# _*_ coding: UTF-8 _*_
import os
import re
import requests
import sys
import wx
import traceback
from datetime import datetime
from datetime import timedelta
from lxml import etree
import data_analysis

# Path of the text file holding the scraped data. It starts empty and is
# assigned by the GUI once the user has supplied a directory and account id.
file_path = ''
class Wb(wx.App):
def Operate(self):
    """Reset the scraper state and build the start-up window.

    Every attribute later filled in by the scraping methods is initialised
    here, then the input form (cookie, account id, output directory) is
    created and its button is bound to ``get_cookie``.
    """
    # ---- scraped data ----------------------------------------------------
    self.cookie = {}
    self.username = ''  # account nickname
    self.Number = 0  # total number of posts reported by the profile page
    self.number1 = 0  # number of posts actually scraped
    self.Guanzhu = 0  # following count
    self.fans = 0  # follower count
    self.Content = []  # post text
    self.Time = []  # post publication time
    self.star = []  # like count per post
    self.Zhuanfa = []  # repost count per post
    self.Pinglun = []  # comment count per post
    self.publish_tool = []  # client the post was published from
    self.Id = 0000
    # Index of the post displayed by weibo_info()/cont(). It used to be
    # left undefined, so the first "previous posts" click raised
    # AttributeError.
    self.a = 0

    # ---- GUI -------------------------------------------------------------
    # Main window and its panel.
    self.frame_operate = wx.Frame(
        None, title="Weibo_Spider_GUI", size=(500, 500))
    self.panel_operate = wx.Panel(self.frame_operate, -1)

    # Title label with the large font (reused by weibo_UI2).
    self.font1 = wx.Font(18, wx.ROMAN, wx.ITALIC, wx.NORMAL)
    self.label1 = wx.StaticText(
        self.panel_operate, -1, "WeiBo Spider", pos=(180, 60), style=wx.ALIGN_CENTER)
    self.label1.SetFont(self.font1)

    # Cookie prompt and input box.
    self.label2 = wx.StaticText(
        self.panel_operate, -1, "請輸入您微博登陸的有效cookie", pos=(160, 130), style=wx.ALIGN_CENTER)
    self.textCookie = wx.TextCtrl(
        self.panel_operate, -1, pos=(200, 150), size=(80, 20), style=wx.TE_CENTER)

    # Account id prompt and input box.
    self.label3 = wx.StaticText(
        self.panel_operate, -1, "請輸入您所要爬取微博賬號的self.Id", pos=(160, 180), style=wx.ALIGN_CENTER)
    self.textId = wx.TextCtrl(
        self.panel_operate, -1, pos=(200, 200), size=(80, 20), style=wx.TE_CENTER)

    # Output directory prompt and input box.
    self.label4 = wx.StaticText(
        self.panel_operate, -1, "數據文件保存路徑", pos=(160, 230), style=wx.ALIGN_CENTER)
    self.textFile_path = wx.TextCtrl(
        self.panel_operate, -1, pos=(200, 250), size=(80, 20), style=wx.TE_CENTER)

    # Start button; clicking it reads the form and runs the scrape.
    self.button_start = wx.Button(self.panel_operate, -1, "開始爬取微博信息", pos=(200, 350))
    self.Bind(wx.EVT_BUTTON, self.get_cookie, self.button_start)
    self.frame_operate.Show()

# 微博的正式UI界面-----------------------------------------------------------------------------------
# 獲取用戶輸入的參數值

#注意getvalue不能和用戶輸入放在一個函數里，要分開寫，而且，獲取不同的值，也要放在不同函數里！！

def get_cookie(self, event):
    """Read the form, remember the inputs and start scraping.

    Stores the cookie and numeric account id on ``self``, sets the
    module-level ``file_path`` to ``<directory>/<id>.txt`` and then calls
    ``Onbutton_Start``. If the id field is not an integer a message box is
    shown and nothing else happens.
    """
    global file_path
    try:
        user_id = int(self.textId.GetValue())
    except ValueError:
        # Previously this propagated out of the event handler and the
        # click silently did nothing.
        wx.MessageBox("請輸入純數字的微博賬號Id")
        return

    self.cookie = {"Cookie": self.textCookie.GetValue()}
    self.Id = user_id
    # os.path.join keeps an empty directory field relative to the working
    # directory instead of producing a path at the filesystem root.
    file_path = os.path.join(self.textFile_path.GetValue(), "%d.txt" % self.Id)
    self.Onbutton_Start()

def Onbutton_Start(self):
    """Run the scrape steps in order, then show the result windows."""
    pipeline = (
        self.GetName,         # nickname
        self.GetSimple_Info,  # post / following / follower counts
        self.weibo_para,      # per-post data
        self.write_txt,       # dump everything to the data file
        self.weibo_UI1,       # "finished" dialog and profile window
    )
    for step in pipeline:
        step()

def weibo_UI1(self):
    """Show a "scraping finished" dialog, then open the profile window."""
    wx.MessageBox("文件爬取完畢")
    self.weibo_UI2()

def weibo_UI2(self):
    """Replace the input window with a summary of the scraped profile."""
    self.frame_operate.Destroy()
    self.frame_Info = wx.Frame(None, title="User_Information", size=(500, 500))
    self.panel_Info = wx.Panel(self.frame_Info, -1)
    panel = self.panel_Info

    def left_label(text, pos):
        """Create a left-aligned static text on the info panel."""
        return wx.StaticText(panel, -1, text, pos=pos, style=wx.ALIGN_LEFT)

    nickname_line = f"用戶昵稱：{self.username}"
    posts_line = f"微博數:{self.Number}"
    fans_line = f"粉絲數:{self.fans}"
    following_line = f"關注數：{self.Guanzhu}"

    self.label16 = left_label(self.username, (200, 100))
    self.label5 = left_label(nickname_line, (180, 130))
    self.label13 = left_label(posts_line, (180, 150))
    self.label14 = left_label(fans_line, (180, 170))
    self.label15 = left_label(following_line, (180, 190))

    # font1 (created in Operate) is the large heading font, font2 the small one.
    self.font2 = wx.Font(13, wx.SCRIPT, wx.ITALIC, wx.NORMAL)
    self.label16.SetFont(self.font1)
    for small in (self.label5, self.label13, self.label14, self.label15):
        small.SetFont(self.font2)

    self.button_news = wx.Button(panel, -1, "查看最近微博", pos=(220, 280))
    self.Bind(wx.EVT_BUTTON, self.weibo_UI3, self.button_news)
    self.frame_Info.Show()

# 最進微博
def weibo_UI3(self, event):
    """Show the newest (or pinned) post and the navigation buttons."""
    self.frame_Info.Destroy()
    self.frame_news = wx.Frame(None, title="---", size=(500, 500))
    self.panel_news = wx.Panel(self.frame_news, -1)
    panel = self.panel_news
    wx.StaticText(panel, -1, "最新微博動態", pos=(200, 40))

    # NOTE(review): the pasted source lost its indentation; the post widgets
    # are assumed to be the only statements guarded by this check because
    # they are the only ones that use the text built from index 0.
    if self.Content:
        body = "最新/置頂微博為: " + self.Content[0]
        tool = "最新/置頂微博發布工具: " + self.publish_tool[0]
        when = "最新/置頂微博發布時間: " + self.Time[0]
        likes = "最新/置頂微博獲得贊數: " + str(self.star[0])
        reposts = "最新/置頂微博獲得轉發數: " + str(self.Zhuanfa[0])
        comments = "最新/置頂微博獲得評論數: " + str(self.Pinglun[0])

        def info_line(text, y):
            """Create one left-aligned detail line at vertical offset *y*."""
            return wx.StaticText(panel, -1, text, pos=(90, y), style=wx.ALIGN_LEFT)

        # A multi-line text control is used for the post body.
        self.label6 = wx.TextCtrl(
            panel, -1, body, pos=(90, 60), size=(250, 140),
            style=wx.TE_MULTILINE | wx.TE_RICH)
        self.label7 = info_line(tool, 200)
        self.label8 = info_line(when, 220)
        self.label9 = info_line(likes, 240)
        self.label10 = info_line(reposts, 260)
        self.label11 = info_line(comments, 280)

    # Browse the earlier posts one at a time.
    self.Button_info = wx.Button(panel, -1, "點擊查看之前的微博內容", pos=(220, 340))
    self.Bind(wx.EVT_BUTTON, self.weibo_pre_info, self.Button_info)
    # Open the chart selection window.
    self.Button_file = wx.Button(panel, -1, "點擊查看微博數據分析圖表", pos=(220, 380))
    self.Bind(wx.EVT_BUTTON, self.analysis_UI, self.Button_file)
    self.frame_news.Show()

def analysis_UI(self, event):
    """Open the window offering one button per chart."""
    self.frame_data = wx.Frame(None, title="data_analysis--20177830115", size=(500, 500))
    self.panel_data = wx.Panel(self.frame_data, -1)

    def chart_button(caption, y, handler):
        """Create a button at vertical offset *y* and bind it to *handler*."""
        button = wx.Button(self.panel_data, -1, caption, pos=(180, y))
        self.Bind(wx.EVT_BUTTON, handler, button)
        return button

    self.button_1 = chart_button("2017-2018微博轉發/點贊量折線統計圖", 120, self.figure_1)
    self.button_2 = chart_button('原創微博與轉發微博統計圖', 160, self.figure_2)
    self.button_3 = chart_button('微博發布工具統計圖', 200, self.figure_3)
    self.button_4 = chart_button('微博使用心情統計圖', 240, self.figure_4)
    self.frame_data.Show()

def figure_1(self, event):
    """Button handler: draw the repost/like line chart from the data file."""
    data_analysis.analysis(file_path, self.Number).analyse_Zhexian()

def figure_2(self, event):
    """Button handler: draw the original-vs-repost pie chart from the data file."""
    data_analysis.analysis(file_path, self.Number).analyse_YC()

def figure_3(self, event):
    """Button handler: draw the publishing-tool pie chart from the data file."""
    data_analysis.analysis(file_path, self.Number).analyse_GJ()

def figure_4(self, event):
    """Button handler: draw the emoticon-usage pie chart from the data file."""
    data_analysis.analysis(file_path, self.Number).analyse_XQ()

def weibo_pre_info(self, event):
    """Event-handler shim in front of ``weibo_info``.

    ``weibo_info`` is also re-entered from ``cont`` without an event object,
    so the button is bound to this wrapper, which simply drops the event.
    """
    self.weibo_info()

def weibo_info(self):
    """Show the post at index ``self.a`` in its own window.

    The window has a "next" button (handled by ``cont``) and a "close"
    button (handled by ``exit``). When ``self.a`` is past the last scraped
    post a message box is shown instead of a window.
    """
    # self.a may not exist yet on the first call; start from the first post.
    index = getattr(self, "a", 0)
    self.a = index
    if not 0 <= index < len(self.Content):
        # Previously stepping past the last post raised IndexError.
        wx.MessageBox("沒有更多微博了")
        return

    self.s = wx.Frame(None, title="---", size=(500, 500))
    self.f = wx.Panel(self.s, -1)

    text1 = str(index + 1) + ":" + self.Content[index]
    text2 = "發布工具: " + self.publish_tool[index]
    text3 = "發布時間: " + self.Time[index]
    text4 = "點贊數: " + str(self.star[index])
    text5 = "轉發數: " + str(self.Zhuanfa[index])
    text6 = "評論數: " + str(self.Pinglun[index])

    # The author used a multi-line TextCtrl here because a StaticText did
    # not lay out long, multi-line post text acceptably.
    self.labela = wx.TextCtrl(
        self.f, -1, text1, pos=(80, 60), size=(250, 140),
        style=wx.TE_MULTILINE | wx.TE_RICH)
    self.labelb = wx.StaticText(self.f, -1, text2, pos=(80, 200), style=wx.ALIGN_LEFT)
    self.labelc = wx.StaticText(self.f, -1, text3, pos=(80, 220), style=wx.ALIGN_LEFT)
    self.labeld = wx.StaticText(self.f, -1, text4, pos=(80, 240), style=wx.ALIGN_LEFT)
    self.labele = wx.StaticText(self.f, -1, text5, pos=(80, 260), style=wx.ALIGN_LEFT)
    self.labelf = wx.StaticText(self.f, -1, text6, pos=(80, 280), style=wx.ALIGN_LEFT)

    self.button_next = wx.Button(self.f, -1, "查看下一條", pos=(300, 380))
    self.button_exit = wx.Button(self.f, -1, "關閉", pos=(100, 380))

    self.Bind(wx.EVT_BUTTON, self.exit, self.button_exit)
    self.Bind(wx.EVT_BUTTON, self.cont, self.button_next)
    self.s.Show()

def exit(self, event):
    """Button handler: close the single-post window."""
    post_window = self.s
    post_window.Destroy()

def cont(self, event):
    """Button handler: advance to the next post and redraw its window."""
    self.a = self.a + 1
    self.s.Destroy()
    self.weibo_info()

具體的爬蟲部分參考了 GitHub 上某位大佬的實現：

# 獲取用戶昵稱
def GetName(self):
    """Fetch the account's info page and store the nickname in ``self.username``."""
    info_url = "https://weibo.cn/%d/info" % (self.Id)
    response = requests.get(info_url, cookies=self.cookie)
    # etree.HTML parses the markup leniently, repairing unclosed tags.
    tree = etree.HTML(response.content)
    page_title = tree.xpath("//title/text()")[0]
    # The title is the nickname followed by a three-character suffix.
    self.username = page_title[:-3]

# 獲取用戶微博數、關注數、粉絲數
def GetSimple_Info(self):
    """Read the post, following and follower counts from the profile page.

    Stores them in ``self.Number``, ``self.Guanzhu`` and ``self.fans``.

    Raises:
        IndexError: if the expected ``div.tip2`` elements are missing.
        ValueError: if one of the labels contains no number.
    """
    pattern = r"\d+\.?\d*"

    def first_number(label):
        """Return the first number found in *label* as an int."""
        found = re.findall(pattern, label)
        if not found:
            # The old for/break loop left its result unbound in this case
            # and failed later with an unrelated NameError.
            raise ValueError("no number found in %r" % (label,))
        return int(found[0])

    url = "https://weibo.cn/u/%d?&page=1" % (self.Id)
    html = requests.get(url, cookies=self.cookie).content
    selector = etree.HTML(html)

    # Post count, e.g. <div class="tip2"><span class="tc">微博[1543]</span>
    wb_num = selector.xpath("//div[@class='tip2']/span[@class='tc']/text()")[0]
    self.Number = first_number(wb_num)

    # The first two links inside div.tip2 carry the following and follower
    # counts; evaluate the query once and index into the result.
    links = selector.xpath("//div[@class='tip2']/a/text()")
    self.Guanzhu = first_number(links[0])
    self.fans = first_number(links[1])

# 獲取"長微博"全部文字內容
def GetLong(self, weibo_link):
    """Return the full text of a long post from its comment page.

    The text is taken from the ``span.ctt`` of the second ``div.c`` on the
    page, stripped of zero-width spaces and of any character the console
    encoding cannot represent.
    """
    page = requests.get(weibo_link, cookies=self.cookie).content
    tree = etree.HTML(page)
    post_div = tree.xpath("//div[@class='c']")[1]
    body_span = post_div.xpath("div/span[@class='ctt']")[0]
    raw_text = body_span.xpath("string(.)").replace(u"\u200b", "")
    encoded = raw_text.encode(sys.stdout.encoding, "ignore")
    return encoded.decode(sys.stdout.encoding)

# 獲取轉發微博信息
def GetZhuanfa(self, is_retweet, info, wb_content):
    """Build the display text for a reposted entry.

    Args:
        is_retweet: the ``span.cmt`` nodes of the post; the first one is
            searched for the original author's link text.
        info: the post's ``div.c`` element.
        wb_content: text of the reposted content.

    Returns:
        The repost comment, original author and reposted text joined by
        newlines, or a fixed notice when no original author link exists.
    """
    author_names = is_retweet[0].xpath("a/text()")
    if not author_names:
        return u"轉發微博已被刪除"

    original_user = author_names[0]
    last_div_text = info.xpath("div")[-1].xpath("string(.)").replace(u"\u200b", "")
    reason = last_div_text.encode(sys.stdout.encoding, "ignore")
    reason = reason.decode(sys.stdout.encoding)
    # Cut the footer off at the last occurrence of the "like" marker.
    reason = reason[:reason.rindex(u"贊")]
    return "\n".join([
        reason,
        u"原始用戶: " + original_user,
        u"轉發內容: " + wb_content,
    ])

#一個界面展示一條微博的發布時間、點贊數、轉發數、評論數
def weibo_para(self):
    """Scrape every timeline page into the per-post lists.

    For each post this appends one entry to ``self.Content``,
    ``self.Time``, ``self.publish_tool``, ``self.star``, ``self.Zhuanfa``
    and ``self.Pinglun`` and increments ``self.number1``.

    NOTE(review): the text markers matched below are written in traditional
    characters, presumably because the article itself was converted; verify
    them against the markup the site actually serves.
    """
    codec = sys.stdout.encoding

    def console_safe(text):
        """Drop every character the console encoding cannot represent."""
        return text.encode(codec, "ignore").decode(codec)

    url = "https://weibo.cn/u/%d?&page=1" % (self.Id)
    html = requests.get(url, cookies=self.cookie).content
    selector = etree.HTML(html)
    # The "mp" input carries the page count; without it assume one page.
    pager = selector.xpath("//input[@name='mp']")
    page_num = int(pager[0].attrib["value"]) if pager else 1

    pattern = r"\d+\.?\d*"
    for page in range(1, page_num + 1):
        url2 = "https://weibo.cn/u/%d?&page=%d" % (self.Id, page)
        html2 = requests.get(url2, cookies=self.cookie).content
        selector2 = etree.HTML(html2)
        info = selector2.xpath("//div[@class='c']")
        # Skip pages without post blocks; indexing an empty result used to
        # raise IndexError.
        if not info or not info[0].xpath("div/span[@class='ctt']"):
            continue

        # The last two div.c elements are not treated as posts.
        for i in range(0, len(info) - 2):
            post = info[i]

            # ---- post text -----------------------------------------------
            str_t = post.xpath("div/span[@class='ctt']")
            Content = console_safe(
                str_t[0].xpath("string(.)").replace(u"\u200b", ""))
            Content = Content[:-1]
            weibo_Id = post.xpath("@id")[0][2:]
            a_link = post.xpath("div/span[@class='ctt']/a")
            is_retweet = post.xpath("div/span[@class='cmt']")
            # A trailing "full text" link means the text is truncated;
            # fetch the complete version from the comment page.
            if a_link and a_link[-1].xpath("text()")[0] == u"全文":
                weibo_link = "https://weibo.cn/comment/" + weibo_Id
                wb_content = self.GetLong(weibo_link)
                if wb_content:
                    if not is_retweet:
                        wb_content = wb_content[1:]
                    Content = wb_content
            if is_retweet:
                Content = self.GetZhuanfa(is_retweet, post, Content)
            self.Content.append(Content)

            # ---- publication time ----------------------------------------
            str_time = console_safe(
                post.xpath("div/span[@class='ct']")[0].xpath("string(.)"))
            time_and_tool = str_time.split(u'來自')
            Time = time_and_tool[0]
            now = datetime.now()
            if u"剛剛" in Time:
                Time = now.strftime('%Y-%m-%d %H:%M')
            elif u"分鍾" in Time:
                minutes_ago = int(Time[:Time.find(u"分鍾")])
                Time = (now - timedelta(minutes=minutes_ago)).strftime(
                    "%Y-%m-%d %H:%M")
            elif u"今天" in Time:
                Time = now.strftime("%Y-%m-%d") + " " + Time[3:]
            elif u"月" in Time:
                # Month/day form without a year: assume the current year.
                Time = (now.strftime("%Y") + "-" + Time[0:2] + "-" +
                        Time[3:5] + " " + Time[7:12])
            else:
                Time = Time[:16]
            self.Time.append(Time)

            # ---- footer counters -----------------------------------------
            # Parsed once; the original repeated this block and discarded
            # the second result.
            str_footer = console_safe(post.xpath("div")[-1].xpath("string(.)"))
            str_footer = str_footer[str_footer.rfind(u'贊'):]
            regx = re.findall(pattern, str_footer, re.M)

            # ---- publishing tool -----------------------------------------
            if len(time_and_tool) > 1:
                publish_tool = time_and_tool[1]
            else:
                publish_tool = u"無"
            self.publish_tool.append(publish_tool)

            # Likes, reposts and comments, in the order they appear.
            self.star.append(int(regx[0]))
            self.Zhuanfa.append(int(regx[1]))
            self.Pinglun.append(int(regx[2]))
            self.number1 += 1

# 將爬取的信息寫入文件--------------------------------------------------------------------------
def write_txt(self):
    """Write the profile summary and every scraped post to ``file_path``.

    The file is written as UTF-8 because the analysis module opens it with
    ``encoding="utf-8"``; writing it in the console encoding made the two
    disagree on any non-UTF-8 console. Errors are printed with a traceback
    rather than raised.
    """
    try:
        parts = [
            u"用戶信息\n用戶昵稱：" + self.username +
            u"\n用戶Id: " + str(self.Id) +
            u"\n微博數: " + str(self.Number) +
            u"\n關注數: " + str(self.Guanzhu) +
            u"\n粉絲數: " + str(self.fans) +
            u"\n\n微博內容: \n" + '\n'
        ]
        for i in range(self.number1):
            # The three counters share one line with no separator; the
            # analysis regexes rely on exactly this layout.
            parts.append(
                str(i + 1) + ":" + self.Content[i] + "\n" +
                u"發布工具: " + self.publish_tool[i] + "\n" +
                u"發布時間: " + self.Time[i] + "\n" +
                u"點贊數: " + str(self.star[i]) +
                u"轉發數: " + str(self.Zhuanfa[i]) +
                u"評論數: " + str(self.Pinglun[i]) + "\n\n")

        # Binary mode keeps "\n" untranslated; the with-block closes the
        # handle even when the write fails.
        with open(file_path, "wb") as f:
            f.write("".join(parts).encode("utf-8"))

    except Exception as e:
        print("Error: ", e)
        traceback.print_exc()

def main():
    """Create the wx application, build the first window and run the event loop."""
    app = Wb()
    app.Operate()
    app.MainLoop()


if __name__ == "__main__":
    main()
數據分析部分：用 matplotlib 製圖。只是粗淺學了一些，所以畫得不夠精美；數據過少，分析可能有點問題。一共畫了四張圖：「2017-2018微博轉發/點贊量折線統計圖」、「原創微博與轉發微博統計圖」、「微博發布工具統計圖」、「微博使用心情統計圖」。

import re
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdate
from matplotlib import font_manager as fm
import time
from datetime import datetime
import webbrowser

class analysis(object):
    """Charts drawn from the text file written by the scraper.

    Each ``analyse_*`` method reads the file again, extracts what it needs
    with regular expressions and opens a matplotlib window.
    """

    def __init__(self, file_name, number):
        """Remember the data file path and the account's total post count."""
        self.file_name = file_name
        self.number = number
        self.X_data = []   # sampled publication times (line chart x axis)
        self.Y1_data = []  # sampled like counts, in thousands
        self.Y_data = []   # sampled repost counts, in thousands
        self.str = ""

    def _read_text(self):
        """Return the whole data file as one string."""
        with open(self.file_name, encoding="utf-8") as f:
            return f.read()

    @staticmethod
    def _tally(items, minimum):
        """Count *items* and keep those seen more than *minimum* times."""
        counts = {}
        for item in items:
            counts[item] = counts.get(item, 0) + 1
        return {key: seen for key, seen in counts.items() if seen > minimum}

    @staticmethod
    def _line_series(text, cutoff_year="2016"):
        """Extract the sampled (times, reposts, likes) series from *text*.

        Posts are taken from the start of the file up to the first one
        published in *cutoff_year*; counts are divided by 1000 and roughly
        18% of the points are kept, evenly spaced.
        """
        reposts = [int(n) for n in re.findall(r'轉發數: (\d+)', text)]
        likes = [int(n) for n in re.findall(r'點贊數: (\d+)', text)]
        stamps = re.findall(
            r'發布時間: (\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2})', text)

        # Default to "everything": the old code left the cut-off index
        # undefined when the file held no post from the cut-off year.
        stop = len(stamps)
        for index, stamp in enumerate(stamps):
            if stamp.startswith(cutoff_year):
                stop = index
                break

        # Keep the three series the same length so sampling cannot run off
        # the end of the shorter ones.
        count = min(stop, len(reposts), len(likes))
        reposts = [n / 1000 for n in reposts[:count]]
        likes = [n / 1000 for n in likes[:count]]
        times = [datetime.strptime(s, "%Y-%m-%d %H:%M") for s in stamps[:count]]

        # With fewer than six posts 18% rounds down to zero; keep every
        # point instead of dividing by zero.
        group = int(0.18 * count) or count
        stride = count // group if group else 1
        picked = range(0, group * stride, stride)
        return ([times[i] for i in picked],
                [reposts[i] for i in picked],
                [likes[i] for i in picked])

    def analyse_Zhexian(self):
        """Draw the repost/like line chart."""
        self.X_data, self.Y_data, self.Y1_data = self._line_series(
            self._read_text())

        fig1 = plt.figure(figsize=(8, 5))
        plt.rcParams['font.sans-serif'] = ['SimHei']  # font able to render the Chinese labels
        ax1 = fig1.add_subplot(1, 1, 1)
        ax1.xaxis.set_major_formatter(mdate.DateFormatter('%Y-%m-%d %H-%M'))
        plt.xticks(self.X_data, rotation=90)  # vertical tick labels
        plt.yticks(np.linspace(0, 5000, 5, endpoint=True))
        plt.title(u"2017-2018微博轉發/點贊量折線圖", color="black")
        plt.plot(self.X_data, self.Y_data, "o-", color='skyblue', label="轉發量", markersize=1.5)
        plt.plot(self.X_data, self.Y1_data, "o-", color='pink', label="點贊量", markersize=1.5)
        plt.xlabel("發布時間")
        plt.ylabel("數量(千/條)")
        plt.legend()
        plt.show()

    def analyse_YC(self):
        """Draw the pie chart of reposted versus original posts."""
        text = self._read_text()
        # A post counts as a repost when its text carries this marker.
        Number_Zhuanfa = len(re.findall(r'轉發理由', text))
        Yuanchuang = self.number - Number_Zhuanfa

        plt.rcParams['font.sans-serif'] = ['SimHei']
        labels = ['轉發微博', '原創微博']
        sizes = [Number_Zhuanfa, Yuanchuang]
        explode = (0.1, 0)
        plt.pie(sizes, explode=explode, labels=labels, autopct='%1.1f%%', shadow=False, startangle=150)
        plt.title(u"原創與轉發微博量", color="black")
        plt.show()

    def analyse_GJ(self):
        """Draw the pie chart of publishing tools used more than ten times."""
        text = self._read_text()
        tools = re.findall(r'發布工具: (.*)\n發布時間', text)
        gongju = self._tally(tools, 10)

        labels = list(gongju.keys())
        sizes = list(gongju.values())
        plt.rcParams['font.sans-serif'] = ['SimHei']
        plt.pie(sizes, labels=labels, autopct='%1.1f%%', shadow=True, startangle=150)
        plt.title(u"微博發布工具統計", color="black")
        plt.show()

    def analyse_XQ(self):
        """Draw the pie chart of emoticons used more than twice."""
        text = self._read_text()
        # NOTE(review): this pattern yields only the first and last
        # bracketed name on a line that has at least two of them.
        pairs = re.findall(r'\[(.{1,4})\].*\[(.{1,4})\]', text)
        names = [name for pair in pairs for name in pair]
        biaoqing = self._tally(names, 2)

        labels = list(biaoqing.keys())
        sizes = list(biaoqing.values())
        # The pasted source had a URL injected into this call.
        fig1, ax1 = plt.subplots()
        patches, texts, autotexts = ax1.pie(sizes, labels=labels, autopct='%1.0f%%',
                                            shadow=False, startangle=170)
        ax1.axis('equal')
        # Shrink the label fonts.
        plt.rcParams['font.sans-serif'] = ['SimHei']
        proptease = fm.FontProperties()
        proptease.set_size('small')
        plt.title(u"微博表情使用次數", color="black")
        plt.setp(autotexts, fontproperties=proptease)
        plt.setp(texts, fontproperties=proptease)
        plt.show()

#在cmd下安裝pyinstaller
pip install pyinstaller
#打包成一個可執行文件 -F (注意將cmd窗口切換至文件保存的路徑下)
pyinstaller -F filename.py

本篇只適合新手簡單學習，筆者也剛學，加上復習周，后期會逐漸完善，畢竟UI寫得太丑了！
另：關於獲取本地用戶cookie和微博賬號的id操作比較簡單在此不再做詳細解釋。如果程序跑不出來相信我一定是cookie問題
---------------------

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯系本站郵箱yoyou2525@163.com刪除。

猜您在找 python數據分析大作業數據分析大作業利用Python進行簡單數據分析--醫院銷售數據分析案例基於ELK的簡單數據分析 Python數據分析與爬蟲 Hadoop實戰：微博數據分析對微博熱搜的爬取及數據分析 Python爬蟲實戰，openpyxl模塊學習，爬取房價信息並簡單的數據分析 Python數據分析系列之——王一博微博轉發量分析1 數據說明2 粉絲結構初步分析3 粉絲畫像最后的話【Python數據分析】簡單爬蟲爬取知乎神回復