80萬商城運營大數據分析
- 用戶行為分析:日訪問量、小時訪問量、不同行為類型訪問量
- 獲客分析
- 用戶留存分析
- 復購分析
- 轉化漏斗分析
- 商品競爭力分析:性別、城市、用戶瀏覽、購買、收藏、商品類目分析
一:導入數據
# 導入常用的庫: import pandas as pd import numpy as np import matplotlib.pyplot as plt import datetime plt.rcParams['font.sans-serif'] = ['SimHei'] plt.rcParams['axes.unicode_minus'] = False # columns = ['user_id','goods_id','cat','behavior','time'] df = pd.read_table("E:/yizhiamumu/behavior.txt") df.info()
打印結果
<class 'pandas.core.frame.DataFrame'> RangeIndex: 757565 entries, 0 to 757564 Data columns (total 10 columns): user_id 757565 non-null int64 goods_id 757565 non-null int64 cat 757565 non-null int64 behavior 757565 non-null object time 757565 non-null int64 sex 757565 non-null int64 addr 757565 non-null object device 757565 non-null object price 757565 non-null float64 amount 757565 non-null int64 dtypes: float64(1), int64(6), object(3) memory usage: 57.8+ MB
觀察數據
df.head()
打印結果
user_id goods_id cat behavior time sex addr device price amount
0 1 5002615 2520377 pv 1574911385 0 成都 Redmi Note8Pro 0.0 0 1 1 2734026 4145813 pv 1574914184 0 成都 Redmi Note8Pro 0.0 0 2 1 5002615 2520377 pv 1574916273 0 成都 Redmi Note8Pro 0.0 0 3 1 3239041 2355072 pv 1574927664 0 成都 Redmi Note8Pro 0.0 0 4 1 4615417 4145813 pv 1574942864 0 成都 Redmi Note8Pro 0.0 0
二:數據清洗
# 數據清廷 df.drop_duplicates(inplace=True) # 時間戳處理 df['time1'] = df['time'].apply(lambda x:datetime.datetime.fromtimestamp(x)) df['month'] = df['time'].apply(lambda x:int(datetime.datetime.fromtimestamp(x).strftime("%Y%m"))) df['date'] = df['time'].apply(lambda x:datetime.datetime.fromtimestamp(x).strftime("%Y%m%d")) df['hour'] = df['time1'].apply(lambda x:x.hour) # 銷售金額計算 df['money'] = df['price'] * df['amount'] # 缺失值處理 df.isnull().sum() df.describe()
打印結果
user_id goods_id cat time sex price amount month hour money count 757565.000000 7.575650e+05 7.575650e+05 7.575650e+05 757565.000000 757565.000000 757565.000000 757565.000000 757565.000000 757565.000000 mean 5583.487645 2.582329e+06 2.716578e+06 1.575162e+09 0.432664 0.955333 0.047983 201911.560921 14.953452 2.383729 std 3184.742636 1.487073e+06 1.465113e+06 1.494278e+05 0.495445 7.915329 0.376215 0.496275 6.063847 21.677706 min 1.000000 2.900000e+01 2.171000e+03 1.574870e+09 0.000000 0.000000 0.000000 201911.000000 0.000000 0.000000 25% 2824.000000 1.304276e+06 1.354236e+06 1.575031e+09 0.000000 0.000000 0.000000 201911.000000 11.000000 0.000000 50% 5671.000000 2.582878e+06 2.735466e+06 1.575181e+09 0.000000 0.000000 0.000000 201912.000000 16.000000 0.000000 75% 8310.000000 3.861863e+06 4.145813e+06 1.575291e+09 1.000000 0.000000 0.000000 201912.000000 20.000000 0.000000 max 11079.000000 5.163006e+06 5.161669e+06 1.575389e+09 1.000000 99.900000 4.000000 201912.000000 23.000000 399.600000
三:數據可視化
1 每天PV UV 走勢分析
# 數據可視化 # 1 每天PV UV 走勢分析 all_puv = pd.pivot_table(df, index=['date'],values='user_id',aggfunc='count') uv = df[['user_id','date']].drop_duplicates()['date'].value_counts() all_puv = all_puv.join(uv) all_puv.columns = ['pv','uv'] all_puv['avg_pv'] = all_puv['pv']/all_puv['uv'] all_puv
打印結果
pv uv avg_pv
date
20191128 106877 7610 14.044284 20191129 111426 7811 14.265267 20191130 114328 7874 14.519685 20191201 121457 8097 15.000247 20191202 153545 10552 14.551270 20191203 149932 10530 14.238557
x=all_puv.index
fig,axes = plt.subplots(1,3,figsize=(18,3)) axes[0].plot(x,all_puv['pv'], color='r', marker='o') axes[1].plot(x,all_puv['uv'], color='g', marker='s') axes[2].plot(x,all_puv['avg_pv'], color='b', marker='d') axes[0].set_title('pv') axes[1].set_title('uv') axes[2].set_title('avg_pv') plt.show()
打印結果
2 用戶行為 pv 分析
# 2 用戶行為 pv 分析 pv = pd.pivot_table(df, index=['date'], columns=['behavior'], values='user_id', aggfunc='count') pv['all'] = pv.sum(axis=1) plt.figure(figsize=(12,5)) plt.plot(pv.index, pv['all']) plt.plot(pv.index, pv['pv'],color='r') plt.plot(pv.index, pv['cart'],color='y') plt.plot(pv.index, pv['buy'], color='c') plt.plot(pv.index, pv['fav'], color='b') plt.xlabel("日期", fontsize=20) plt.ylabel('pv', fontsize=20) plt.title("用戶行為pv 分析", fontsize=24) plt.legend(["all","pv","cart","buy","fav"], loc = 'upper left', fontsize=20) plt.show()
打印結果
3 訪問高峰分析
# 3 訪問高峰分析 hour_pv = df['hour'].value_counts().reset_index().rename(columns={'index':'hour', 'hour':'pv'}) plt.figure(figsize=(12,4)) plt.bar(hour_pv['hour'], hour_pv['pv'],color='r') plt.xlabel("小時", fontsize=20) plt.ylabel("訪問量pv", fontsize=20) plt.title("每小時用戶訪問量數據分析", fontsize = 24) plt.show()
4 新用戶分析
# 4 新用戶分析 new_user= df[['user_id','date']].groupby('user_id').min()['date'].value_counts().reset_index() new_user.columns = ['date', 'new_user'] plt.figure(figsize=(10,6)) x,y = new_user['date'], new_user['new_user'] plt.bar(x,y,width=0.6,color='r') for a, b in zip(x,y): plt.text(a,b + 0.05, '%.0f' % b, ha='center', va='bottom', fontsize=20) plt.xlabel("日期", fontsize=16) plt.ylabel("新用戶數", fontsize=16) plt.title("每日新增用戶數", fontsize=20) plt.show()
5 用戶留存分析
# 5 用戶留存分析 # 建立 n 日留存率計算模型,數據傳入用戶id 和登錄日期 # n 為n 日留存, 不傳入start_date 和 n 時,則計算所有留存 def cal_retention(df, start_date='20190101',n=0): if n>0: new_user = df[['user_id','date']].groupby('user_id').min().reset_index() date2 = datetime.datetime.strptime(start_date, '%Y%m%d')+datetime.timedelta(n) end_date = datetime.datetime.strftime(date2,'%Y%m%d') start_user = set(new_user[new_user.date==start_date].user_id) end_user = set(df[df.date==end_date].user_id) user = start_user&end_user return [start_date, end_date, len(start_user),len(user),round(len(user)/len(start_user),4)] else: new_user = df[['user_id',"date"]].groupby('user_id').min().reset_index() date_source = new_user.date.unique() date_source.sort() result1 = [] flag =0 for start_date in date_source: start_user = set(new_user[new_user.date == start_date].user_id) for end_date in date_source[flag:]: end_user = set(df[df.date==end_date].user_id) user = start_user&end_user result1.append([start_date,end_date,len(start_user),len(user),round(len(user)/len(start_user),4)]) flag = flag+1 return pd.DataFrame(result1, columns=['開始日期','留存日期','新用戶數','留存人數','留存率']) # 調用cal_retention 函數計算留存 # cal_retention(df[['user_id','date']],'20191128',3) retention = cal_retention(df[['user_id','date']]) # 留存人數展示 pd.pivot_table(retention, index=['開始日期'],columns=['留存日期'],values='留存人數',aggfunc='sum',fill_value=0) # 留存率展示 pd.pivot_table(retention, index=['開始日期'],columns=['留存日期'],values='留存率',aggfunc='sum',fill_value=0)
6 復購率計算
# 6 復購率計算 data_buy = df[df.behavior=='buy'][['user_id','date']].drop_duplicates()['user_id'].value_counts().reset_index() data_buy.columns = ['user_id','num'] re_buy_rate = round(len(data_buy[data_buy.num>=2])/len(data_buy),4) print("復購率為:",round(re_buy_rate*100,2),"%") # 購買總人數 buy_user = len(data_buy) # 網聯次數的人數分布 buy_freq = data_buy.num.value_counts().reset_index() buy_freq.columns = [['購買次數','人數']] buy_freq['人數占比'] = round(buy_freq['人數']/buy_user,4) buy_freq
打印結果
復購率為: 42.06 % 購買次數 人數 人數占比 0 1 3540 0.5794 1 2 1714 0.2805 2 3 575 0.0941 3 4 210 0.0344 4 5 57 0.0093 5 6 14 0.0023
7 商品TOP 分析
7.1 商品銷售TOP
# 7.1 商品銷售TOP buy_top = df[df.behavior=='buy']['goods_id'].value_counts().head(10) from pyecharts.charts import Bar from pyecharts import options as opts bar = Bar() bar.add_xaxis(buy_top.index.tolist()) bar.add_yaxis("商品銷售TOP",buy_top.values.tolist(),color='red') bar.set_global_opts(title_opts=opts.TitleOpts(title="商品銷售TOP")) bar.render_notebook()
7.2商品瀏覽TOP
# 7.2商品瀏覽TOP pv_top = df[df.behavior=='pv']['goods_id'].value_counts().head(10) from pyecharts.charts import Bar from pyecharts import options as opts bar = Bar() bar.add_xaxis(pv_top.index.tolist()) bar.add_yaxis("商品瀏覽TOP",pv_top.values.tolist(),color='red') bar.set_global_opts(title_opts=opts.TitleOpts(title="商品瀏覽TOP")) bar.render_notebook()
7.3商品收藏 TOP
# 7.3商品收藏 TOP fav_top = df[df.behavior=='fav']['goods_id'].value_counts().head(10) from pyecharts.charts import Bar from pyecharts import options as opts bar = Bar() bar.add_xaxis(fav_top.index.tolist()) bar.add_yaxis("商品收藏TOP",fav_top.values.tolist(),color='red') bar.set_global_opts(title_opts=opts.TitleOpts(title="商品收藏TOP")) bar.render_notebook()
8 城市購買競爭力TOP
# 8 城市購買競爭力TOP city_top = df[df.behavior=='buy'][['addr','money']].groupby('addr').sum().sort_values('money',ascending=False) bar = Bar() bar.add_xaxis(city_top.index.tolist()) bar.add_yaxis("城市購買競爭力TOP",[round(a,2) for a in city_top.money], color='red') bar.set_global_opts(title_opts=opts.TitleOpts(title="城市購買競爭力TOP")) bar.render_notebook()
9 不同性別購買力分析
# 9 不同性別購買力情況 0-女,1-男 sex_top = df[df.behavior=='buy'][['sex','money']].groupby('sex').sum() from pyecharts.charts import Pie pie = Pie() pie.add("",[list(z) for z in zip(['女','男'], [round(a,2) for a in city_top.money])]) pie.set_global_opts(title_opts=opts.TitleOpts(title="不同性別購買力分析")) pie.set_series_opts(label_opts=opts.LabelOpts(formatter="{b}:{c} ({d}%)")) pie.render_notebook()
10商品類目競爭力分析
# 10商品類目競爭力分析 cat_top = df[df.behavior=='buy']['cat'].value_counts().head(10) bar = Bar() bar.add_xaxis(cat_top.index.tolist()) bar.add_yaxis("商品類目競爭力分析",cat_top.values.tolist()) bar.render_notebook()
11 總體轉化漏斗分析
# 11 總體轉化漏斗分析 from pyecharts.charts import Funnel from pyecharts import options as opts data_behavior = df[df.behavior!="fav"]['behavior'].value_counts().reset_index().rename(columns={"index":"環節","behavior":"人數"}) # 單一環節的轉化率 t1 = np.array(data_behavior['人數'][1:]) t2 = np.array(data_behavior['人數'][0:-1]) single_convs = t1/t2 single_convs = list(single_convs) single_convs.insert(0,1) # 總體轉化率 flag = data_behavior['人數'][0] data_behavior['總體轉化率'] = data_behavior['人數']/flag attrs = [a+": "+str(round(b,2))+"%" for a,b in zip(data_behavior['環節'],data_behavior['總體轉化率']*100)] attr_value = [round(a,2) for a in data_behavior['總體轉化率'] * 100] funnel = Funnel() funnel.add("商品",[list(z) for z in zip(attrs, attr_value)], label_opts = opts.LabelOpts(position="inside")) funnel.set_global_opts(title_opts=opts.TitleOpts(title="總體轉化漏斗數據分析")) funnel.render_notebook()
12 RFM 用戶分層模型
# RFM 模型打分規則 def recency(x): if x<=2: return 5 elif x==3: return 4 elif x==4: return 3 elif x==5: return 2 elif x>=6: return 1 def frequency(x): if x>=8: return 5 elif (x>=6)&(x<8): return 4 elif (x>=4)&(x<6): return 3 elif (x>=2)&(x<4): return 2 elif (x>=0)&(x<2): return 1 def monetary(x): if x>=300: return 5 elif (x>=200)&(x<300): return 4 elif (x>100)&(x<200): return 3 elif (x>50)&(x<100): return 2 elif (x>=0)&(x<50): return 1 # RFM 用戶分層 RFM_date = df[df.behavior == 'buy'][['user_id','date']].groupby("user_id").max() RFM_F = df[df.behavior == 'buy'][['user_id','behavior']].groupby('user_id').count() RFM_M = df[df.behavior == 'buy'][['user_id','money']].groupby('user_id').mean() RFM = RFM_date.join(RFM_F).join(RFM_M) # 用戶價值分層 end_date = datetime.datetime.strptime('20191205',"%Y%m%d") # 時間間隔天數計算 RFM['days'] = RFM['date'].apply(lambda x:(end_date-datetime.datetime.strptime(x,"%Y%m%d")).days) RFM = RFM[['days','behavior','money']] RFM.columns = ['間隔天數','消費頻次','消費金額'] RFM.head()
打印結果:
間隔天數 消費頻次 消費金額 user_id 2 3 7 186.742857 4 5 4 142.600000 16 4 2 84.650000 17 4 1 62.700000 20 4 1 37.900000
RFM數據建模:
# RFM 數據處理 RFM['R_S'] = RFM['間隔天數'].apply(recency) RFM['F_S'] = RFM['消費頻次'].apply(frequency) RFM['M_S'] = RFM['消費金額'].apply(monetary) RFM['RFM'] = RFM.apply(lambda x:int(x.R_S * 100 + x.F_S * 10 + x.M_S),axis=1) RFM.head()
打印結果
間隔天數 消費頻次 消費金額 R_S F_S M_S RFM user_id 2 3 7 186.742857 4 4 3.0 443 4 5 4 142.600000 2 3 3.0 233 16 4 2 84.650000 3 2 2.0 322 17 4 1 62.700000 3 1 2.0 312 20 4 1 37.900000 3 1 1.0 311
歡迎關注:一只阿木木