1.背景及問題
現某IT產品銷售公司,有一定量的小公司水平的用戶,這些用戶在做出購買時,會接觸到銷售公司的多個營銷渠道,不同的渠道上投入怎樣分配,以實現營銷效益的最大化,便成為了很多公司的市場營銷部門亟需解決的問題。
即:找出轉化率最高的渠道路徑或方式
2.思路步驟
- 線性模型分析
- 馬爾科夫鏈分析
- 可視化馬爾科夫鏈

轉換率計算
轉換率計算
- 第一次點擊:用戶訪問路徑上的第一個觸點獲取所有貢獻值
- 最後一次點擊:用戶購買之前最後一個觸點獲取所有貢獻值
- 線性模型分析:用戶訪問路徑上的所有觸點平分貢獻值
- 馬爾科夫鏈:馬爾科夫鏈的轉移矩陣 -> 每個觸點的移除效應 -> 觸點貢獻值
3.數據集介紹
Id: 某IT產品銷售公司的客戶,客戶類型是小公司
Segment: 客戶的畫像
Channel:客戶生命周期中觸及過的渠道; DM(直郵),EM(電子郵件), PHONE(電話)和 WEB(產品銷售官網瀏覽記錄)
Date: 客戶觸及渠道的日期,觸及時間長度為1年
Pur_flag: 等於1表示該客戶在接觸完相應渠道后,完成了IT產品的購買
4.代碼及具體步驟
導入模塊
import numpy as np
import pandas as pd
import networkx as nx
from pprint import pprint
import os
import matplotlib.pyplot as plt
# Work from the project folder so the relative CSV path below resolves.
os.chdir(r'C:/Users/pc/Desktop/數據分析項目/客戶轉化分析/')
# Load the data, parsing the fourth column (position 3) as dates.
df = pd.read_csv(r'./ChannelAttribute.csv', parse_dates=[3])
# Preview the first rows.
df.head()
id | segment | channel | date | pur_flag | |
---|---|---|---|---|---|
0 | 20398764672 | Tier 2 | DM | 2018-03-19 | 0 |
1 | 20408399343 | Tier 2 | WEB | 2017-09-27 | 0 |
2 | 20438922645 | Tier 2 | WEB | 2017-11-15 | 0 |
3 | 20225918468 | Tier 2 | DM | 2017-05-24 | 0 |
4 | 20278581048 | Tier 3 | DM | 2018-04-23 | 0 |
創建路徑數據
def create_path_with_value(data, element):
    """Build the ordered touchpoint path of a single customer.

    Args:
        data: DataFrame with at least the columns ``id``, ``date``,
            ``channel`` and ``pur_flag``.
        element: The ``id`` value of the customer whose path is built.

    Returns:
        A three-item list ``[path, conv, conv_null]``. ``path`` starts with
        ``'start'``, lists the customer's channels in ascending ``date``
        order and ends with ``'conversion'`` or ``'null'``. ``conv`` is 1 and
        ``conv_null`` is 0 when the customer converted, and the other way
        round otherwise.
    """
    touches = data.loc[data['id'] == element, :].sort_values(
        ['id', 'date'], ascending=[False, True])
    # A customer counts as converted when any of their rows is flagged.
    # Comparing ``unique()`` with 1 inside an ``if`` raised "truth value is
    # ambiguous" as soon as one customer carried both 0 and 1 flags.
    converted = bool((touches['pur_flag'] == 1).any())
    path = ['start', *touches['channel'].tolist()]
    path.append('conversion' if converted else 'null')
    conv = 1 if converted else 0
    return [path, conv, 1 - conv]
# One [path, conv, conv_null] record per distinct customer id.
records = [create_path_with_value(df, customer_id)
           for customer_id in df['id'].unique()]
final_path = [record[0] for record in records]
conv = [record[1] for record in records]
conv_null = [record[2] for record in records]
# Path data: one row per customer.
path_data = pd.DataFrame({'path': final_path, 'conv': conv, 'conv_nulls': conv_null})
path_data.head(10)
path | conv | conv_nulls | |
---|---|---|---|
0 | [start, WEB, EM, DM, null] | 0 | 1 |
1 | [start, EM, WEB, DM, null] | 0 | 1 |
2 | [start, WEB, EM, DM, null] | 0 | 1 |
3 | [start, DM, EM, WEB, null] | 0 | 1 |
4 | [start, EM, WEB, DM, null] | 0 | 1 |
5 | [start, PHONE, EM, DM, WEB, null] | 0 | 1 |
6 | [start, PHONE, WEB, DM, null] | 0 | 1 |
7 | [start, DM, PHONE, WEB, null] | 0 | 1 |
8 | [start, WEB, EM, DM, conversion] | 1 | 0 |
9 | [start, PHONE, WEB, DM, null] | 0 | 1 |
歸因分析(最后一次點擊、第一次點擊和線性模型)
def create_last_click_stats_pair(data):
    """Give each path's whole conversion value to its last touchpoint.

    Args:
        data: DataFrame with a ``path`` column (sequences whose final item
            is the terminal state) and a ``conv`` column.

    Returns:
        DataFrame with one row per input row: ``touch`` is the item just
        before the end of the path, ``Last_Conv`` is that row's ``conv``.
    """
    last_touches = [path[-2] for path in data['path']]
    credits = list(data['conv'])
    return pd.DataFrame({'touch': last_touches, 'Last_Conv': credits})
def create_first_order_states_pair(data):
    """Give each path's whole conversion value to its first touchpoint.

    Args:
        data: DataFrame with a ``path`` column (sequences whose first item
            is the entry state) and a ``conv`` column.

    Returns:
        DataFrame with one row per input row: ``touch`` is the second item
        of the path, ``First_Conv`` is that row's ``conv``.
    """
    first_touches = [path[1] for path in data['path']]
    credits = list(data['conv'])
    return pd.DataFrame({'touch': first_touches, 'First_Conv': credits})
def create_linear_click_stats_pair(data):
    """Split each path's conversion value evenly over all its touchpoints.

    The touchpoints of a path are the items between its first item (the
    entry state) and its last item (the terminal state). Paths of any length
    are supported; the earlier version only handled 1 to 4 touchpoints and
    credited just the first touch of anything longer.

    Args:
        data: DataFrame with a ``path`` column (sequences) and a ``conv``
            column (numeric conversion value of the path).

    Returns:
        DataFrame with one row per touchpoint occurrence: ``touch`` is the
        touchpoint and ``Linear_Conv`` its equal share of the path's
        ``conv``. Paths without any touchpoint contribute no rows.
    """
    temp_path, temp_conv = [], []
    for path, conv in zip(data['path'], data['conv']):
        touches = path[1:-1]
        if not touches:
            # Nothing to credit; also avoids dividing by zero.
            continue
        share = conv / len(touches)
        temp_path.extend(touches)
        temp_conv.extend([share] * len(touches))
    return pd.DataFrame({'touch': temp_path, 'Linear_Conv': temp_conv})
# Total credit per channel under each of the three heuristic models.
last_touch = (
    create_last_click_stats_pair(path_data)
    .groupby('touch')['Last_Conv']
    .sum()
    .reset_index()
)
linear_touch = (
    create_linear_click_stats_pair(path_data)
    .groupby('touch')['Linear_Conv']
    .sum()
    .reset_index()
)
first_touch = (
    create_first_order_states_pair(path_data)
    .groupby('touch')['First_Conv']
    .sum()
    .reset_index()
)
# Turn each model's totals into shares of that model's column sum.
lst = last_touch.set_index('touch')
lst = lst / lst.sum()
li = linear_touch.set_index('touch')
li = li / li.sum()
fst = first_touch.set_index('touch')
fst = fst / fst.sum()
# Side-by-side view: first click, last click, linear.
dfs = [fst, lst, li]
fst.join([lst, li])
First_Conv | Last_Conv | Linear_Conv | |
---|---|---|---|
touch | |||
DM | 0.341152 | 0.744850 | 0.504964 |
EM | 0.278233 | 0.097915 | 0.198478 |
PHONE | 0.094440 | 0.014768 | 0.050488 |
WEB | 0.286175 | 0.142467 | 0.246070 |
**線性模型分析結論: 相比於其他渠道,DM(直郵)是轉化率較優的渠道**
馬爾科夫鏈
# Manual computation of the state transition matrix.
def split_states(data):
    """Break every path into its consecutive state transitions.

    Args:
        data: DataFrame with a ``path`` column holding sequences of states.

    Returns:
        A list with one ``[state_pairs, values]`` entry per row, where
        ``state_pairs`` lists the ``(current, next)`` tuples of the path in
        order and ``values`` holds a 1 for each of those tuples.
    """
    result = []
    for path in data['path']:
        transitions = list(zip(path, path[1:]))
        result.append([transitions, [1] * len(transitions)])
    return result
# Turn every path into its consecutive (current, next) state pairs.
temps = split_states(path_data)
# Inspect the first three entries.
temps[0:3]
[[[('start', 'WEB'), ('WEB', 'EM'), ('EM', 'DM'), ('DM', 'null')],
[1, 1, 1, 1]],
[[('start', 'EM'), ('EM', 'WEB'), ('WEB', 'DM'), ('DM', 'null')],
[1, 1, 1, 1]],
[[('start', 'WEB'), ('WEB', 'EM'), ('EM', 'DM'), ('DM', 'null')],
[1, 1, 1, 1]]]
def transition_maxtrix(data):
    """Compute the transition shares between states, formatted as text.

    Args:
        data: List of ``[state_pairs, values]`` entries, where
            ``state_pairs`` is a sequence of ``(start, end)`` tuples and
            ``values`` the matching sequence of counts.

    Returns:
        DataFrame indexed by ``end`` state with one column per ``start``
        state. Each cell is the share of the column's total count that goes
        to that ``end`` state, rendered with ``"{:3.2f}"``; combinations
        that never occur read ``'0.00'``.
    """
    starts, ends, weights = [], [], []
    for entry in data:
        for (origin, target), count in zip(entry[0], entry[1]):
            starts.append(origin)
            ends.append(target)
            weights.append(count)
    flat = pd.DataFrame({'start': starts, 'end': ends, 'values': weights})
    # crosstab sums duplicate pairs itself, so no separate groupby is needed.
    # The string alias 'sum' avoids the warning pandas 2.x emits for np.sum.
    shares = pd.crosstab(flat['end'], flat['start'], values=flat['values'],
                         aggfunc='sum', normalize='columns')
    # Column-wise Series.map replaces the deprecated DataFrame.applymap.
    return shares.apply(lambda col: col.map("{:3.2f}".format))
# Build the transition matrix, then transpose it so origin states are the rows.
tmp = transition_maxtrix(temps)
tmp1 = tmp.transpose()
tmp1
end | DM | EM | PHONE | WEB | conversion | null |
---|---|---|---|---|---|---|
start | ||||||
DM | 0.00 | 0.09 | 0.01 | 0.12 | 0.21 | 0.57 |
EM | 0.43 | 0.00 | 0.02 | 0.41 | 0.04 | 0.09 |
PHONE | 0.18 | 0.11 | 0.00 | 0.69 | 0.01 | 0.01 |
WEB | 0.58 | 0.22 | 0.03 | 0.00 | 0.05 | 0.12 |
start | 0.14 | 0.29 | 0.32 | 0.25 | 0.00 | 0.00 |
計算渠道的移除效應
def channel_remove(data, channel_removed):
    """Aggregate transition counts and mask one channel as ``'unknown'``.

    Args:
        data: List of ``[state_pairs, values]`` entries, where
            ``state_pairs`` is a sequence of ``(start, end)`` tuples and
            ``values`` the matching sequence of counts.
        channel_removed: State label to replace with ``'unknown'``.

    Returns:
        A two-item list ``[removed, intact]``. Both are DataFrames with the
        columns ``state_pairs``, ``values``, ``start`` and ``end``, one row
        per distinct pair in sorted pair order, ``values`` being the summed
        count. In ``removed`` every ``start``/``end`` equal to
        ``channel_removed`` reads ``'unknown'``; ``intact`` keeps the
        original labels. An empty ``data`` yields two empty frames.
    """
    totals = {}
    for entry in data:
        for pair, weight in zip(entry[0], entry[1]):
            totals[pair] = totals.get(pair, 0) + weight
    # Build the columns directly instead of expanding tuples row by row
    # with apply(pd.Series).
    ordered = sorted(totals)
    temp = pd.DataFrame({
        'state_pairs': ordered,
        'values': [totals[pair] for pair in ordered],
        'start': [pair[0] for pair in ordered],
        'end': [pair[1] for pair in ordered],
    })
    grp_df = temp.copy()
    grp_df['start'] = grp_df['start'].replace(channel_removed, 'unknown')
    grp_df['end'] = grp_df['end'].replace(channel_removed, 'unknown')
    return [grp_df, temp]
# Keep only the paths that ended in a purchase.
path_data_pur = path_data[path_data['conv'] == 1]
temps = split_states(path_data_pur)
conversion = []
columns = ['start', 'end', 'values_x', 'values_y', 'perct']
# Every channel present in the raw data.
channels_list = list(df['channel'].unique())
# Self-transitions for 'start', 'conversion' and 'null'. They put these three
# labels on both axes of the cross-tabulations built in the loop.
df_dummy1 = pd.DataFrame({'start': ['start', 'conversion', 'null'],
                          'end': ['start', 'conversion', 'null'],
                          'values_x': [0, 0, 0],
                          'values_y': [0, 0, 0],
                          'perct': [0, 1, 1]})
dy_dummy = pd.DataFrame(df_dummy1, columns=columns)
df_dummy2 = pd.DataFrame({'start': ['start', 'conversion', 'null'],
                          'end': ['start', 'conversion', 'null']})
fill_cols = ['values_x', 'values_y', 'perct']
# Estimate, channel by channel, the conversions left once it is removed.
for chnl in channels_list:
    # A single call returns both frames: [chnl masked as 'unknown', untouched].
    df_remove, df_noremove = channel_remove(temps, chnl)
    # Outgoing total per origin state, then each transition's share of it.
    df_temp = df_remove.groupby('start')['values'].sum().reset_index()
    df_temp = pd.merge(df_remove, df_temp, on='start', how='left')
    df_temp['perct'] = df_temp['values_x'] / df_temp['values_y']
    df_temp = pd.DataFrame(df_temp, columns=columns)
    df_temp = pd.concat([df_temp, dy_dummy], axis=0)
    df_ini = pd.DataFrame(df_noremove, columns=['start', 'end'])
    df_temp2 = pd.concat([df_ini, df_dummy2], axis=0)
    df_temp = pd.merge(df_temp2, df_temp, on=['start', 'end'], how='left')
    # Transitions that touch the removed channel find no match in the left
    # join; zero them. The result is assigned back because
    # fillna(inplace=True) on a column selection may only change a temporary.
    df_temp[fill_cols] = df_temp[fill_cols].fillna(0)
    df_trans1 = pd.crosstab(df_temp['start'], df_temp['end'],
                            values=df_temp['perct'], aggfunc='sum').fillna(0)
    # Transition shares as a plain numeric array.
    df_trans_mat = np.asarray(df_trans1, dtype=float)
    inist_n1 = pd.crosstab(df_temp['start'], df_temp['end'],
                           values=df_temp['values_x'], aggfunc='sum').fillna(0)
    # Counts leaving 'start', looked up by label and lined up with the rows
    # of the transition array instead of relying on the row being last.
    inist_row = inist_n1.loc['start'].reindex(df_trans1.index, fill_value=0)
    inist_mat = np.asarray(inist_row, dtype=float).reshape(1, -1)
    # Matrix product
    mat = np.dot(inist_mat, df_trans_mat)
    # Read the estimated conversions from the 'conversion' column, by label.
    conversion.append(mat[0, df_trans1.columns.get_loc('conversion')])
# Removal effect of each channel
chnl_conversion = pd.DataFrame({'channel': channels_list, 'conv': conversion})
# Masking a channel renames labels only, so the counts of the frame left over
# from the last iteration add up to the same total as a fresh call would.
# NOTE(review): this baseline sums every transition count of the converting
# paths, not the number of converting paths; confirm it is the intended one.
tot_conv = df_remove['values'].sum()
chnl_conversion['impact'] = (tot_conv - chnl_conversion['conv']) / tot_conv
tot_impact = chnl_conversion['impact'].sum()
chnl_conversion['convet_rate'] = chnl_conversion['impact'] / tot_impact
chnl_conversion
channel | conv | impact | convet_rate | |
---|---|---|---|---|
0 | DM | 1003.039274 | 0.961468 | 0.265241 |
1 | WEB | 2724.928034 | 0.895320 | 0.246993 |
2 | EM | 2856.954317 | 0.890248 | 0.245594 |
3 | PHONE | 3179.825240 | 0.877845 | 0.242172 |
馬爾科夫鏈模型分析結論: 相比於其他渠道,DM(直郵)的轉換貢獻值最高,是轉化率較優的渠道
可視化馬爾科夫鏈
# Node labels for the chain: the entry state, the four channels and the two outcomes.
states = ['start', 'DM', 'EM', 'PHONE', 'WEB', 'conversion', 'null']
def _get_markov_edges(Q):
edges = {}
for col in Q.columns:
for idx in Q.index:
edges[(idx,col)] = Q.loc[idx,col]
return edges
# Collect the value of every (origin, destination) cell of the transposed matrix.
edges_wts = _get_markov_edges(tmp1)
edges_wts
{('DM', 'DM'): '0.00',
('EM', 'DM'): '0.43',
('PHONE', 'DM'): '0.18',
('WEB', 'DM'): '0.58',
('start', 'DM'): '0.14',
('DM', 'EM'): '0.09',
('EM', 'EM'): '0.00',
('PHONE', 'EM'): '0.11',
('WEB', 'EM'): '0.22',
('start', 'EM'): '0.29',
('DM', 'PHONE'): '0.01',
('EM', 'PHONE'): '0.02',
('PHONE', 'PHONE'): '0.00',
('WEB', 'PHONE'): '0.03',
('start', 'PHONE'): '0.32',
('DM', 'WEB'): '0.12',
('EM', 'WEB'): '0.41',
('PHONE', 'WEB'): '0.69',
('WEB', 'WEB'): '0.00',
('start', 'WEB'): '0.25',
('DM', 'conversion'): '0.21',
('EM', 'conversion'): '0.04',
('PHONE', 'conversion'): '0.01',
('WEB', 'conversion'): '0.05',
('start', 'conversion'): '0.00',
('DM', 'null'): '0.57',
('EM', 'null'): '0.09',
('PHONE', 'null'): '0.01',
('WEB', 'null'): '0.12',
('start', 'null'): '0.00'}
# Drop the edges whose transition probability is zero.
zero_edges = [pair for pair, weight in edges_wts.items() if weight == '0.00']
for pair in zero_edges:
    del edges_wts[pair]
pprint(edges_wts)
{('DM', 'EM'): '0.09',
('DM', 'PHONE'): '0.01',
('DM', 'WEB'): '0.12',
('DM', 'conversion'): '0.21',
('DM', 'null'): '0.57',
('EM', 'DM'): '0.43',
('EM', 'PHONE'): '0.02',
('EM', 'WEB'): '0.41',
('EM', 'conversion'): '0.04',
('EM', 'null'): '0.09',
('PHONE', 'DM'): '0.18',
('PHONE', 'EM'): '0.11',
('PHONE', 'WEB'): '0.69',
('PHONE', 'conversion'): '0.01',
('PHONE', 'null'): '0.01',
('WEB', 'DM'): '0.58',
('WEB', 'EM'): '0.22',
('WEB', 'PHONE'): '0.03',
('WEB', 'conversion'): '0.05',
('WEB', 'null'): '0.12',
('start', 'DM'): '0.14',
('start', 'EM'): '0.29',
('start', 'PHONE'): '0.32',
('start', 'WEB'): '0.25'}
# Draw the Markov chain with networkx.
# Append the bundled Graphviz binaries to PATH for the layout step below.
os.environ["PATH"] += os.pathsep + './graphviz-2.38/release/bin/'
G = nx.MultiDiGraph()
# One node per state.
G.add_nodes_from(states)
# Format the node view into the message; the plain literal used before
# printed the placeholder text itself.
print('Nodes:\n{}\n'.format(G.nodes()))
# One edge per remaining transition; its probability is both weight and label.
for k, v in edges_wts.items():
    tmp_origin, tmp_destination = k[0], k[1]
    G.add_edge(tmp_origin, tmp_destination, weight=v, label=v)
print('Edges:')
pprint(G.edges(data=True))
pos = nx.drawing.nx_pydot.graphviz_layout(G, prog='dot')
nx.draw_networkx(G, pos)
# Build the edge labels.
edge_labels = {(n1, n2): d['label'] for n1, n2, d in G.edges(data=True)}
nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels)
# Also export the graph in DOT format.
nx.drawing.nx_pydot.write_dot(G, 'customer_markov.dot')
Nodes:
{G.nodes()}
Edges:
OutMultiEdgeDataView([('DM', 'WEB', {'label': '0.12', 'weight': '0.12'}), ('DM', 'EM', {'label': '0.09', 'weight': '0.09'}), ('DM', 'null', {'label': '0.57', 'weight': '0.57'}), ('DM', 'PHONE', {'label': '0.01', 'weight': '0.01'}), ('DM', 'conversion', {'label': '0.21', 'weight': '0.21'}), ('start', 'EM', {'label': '0.29', 'weight': '0.29'}), ('start', 'DM', {'label': '0.14', 'weight': '0.14'}), ('start', 'PHONE', {'label': '0.32', 'weight': '0.32'}), ('start', 'WEB', {'label': '0.25', 'weight': '0.25'}), ('EM', 'DM', {'label': '0.43', 'weight': '0.43'}), ('EM', 'WEB', {'label': '0.41', 'weight': '0.41'}), ('EM', 'null', {'label': '0.09', 'weight': '0.09'}), ('EM', 'PHONE', {'label': '0.02', 'weight': '0.02'}), ('EM', 'conversion', {'label': '0.04', 'weight': '0.04'}), ('WEB', 'DM', {'label': '0.58', 'weight': '0.58'}), ('WEB', 'EM', {'label': '0.22', 'weight': '0.22'}), ('WEB', 'null', {'label': '0.12', 'weight': '0.12'}), ('WEB', 'PHONE', {'label': '0.03', 'weight': '0.03'}), ('WEB', 'conversion', {'label': '0.05', 'weight': '0.05'}), ('PHONE', 'WEB', {'label': '0.69', 'weight': '0.69'}), ('PHONE', 'DM', {'label': '0.18', 'weight': '0.18'}), ('PHONE', 'null', {'label': '0.01', 'weight': '0.01'}), ('PHONE', 'EM', {'label': '0.11', 'weight': '0.11'}), ('PHONE', 'conversion', {'label': '0.01', 'weight': '0.01'})])