使用 pandas 實現 RFM 模型


import pandas as pd
import numpy as np


# Load the raw order rows, parse the purchase date, and record for every row
# the number of whole days between that date and today.
df = pd.read_csv('./zue_164466.csv').assign(
    ptdate=lambda frame: pd.to_datetime(frame['ptdate'], format='%Y-%m-%d'),
)
df['dateDiff'] = (pd.to_datetime('today') - df['ptdate']).dt.days

# Aggregate the order rows per (user_email, product_name) pair into the three
# raw RFM inputs.
# The dict-renaming form ``SeriesGroupBy.agg({'name': func})`` raises
# SpecificationError on pandas >= 1.0, so every column is reduced with the
# dedicated groupby method and named explicitly through ``to_frame``.
grouped = df.groupby(by=['user_email', 'product_name'])

# Recency: smallest day difference in the group, i.e. the most recent purchase.
R_Agg = grouped['dateDiff'].min().to_frame('RecencyAgg')

# Frequency: number of order rows in the group.
F_Agg = grouped['ptdate'].size().to_frame('FrequencyAgg')

# Monetary: sum of totalcost over the group.
M_Agg = grouped['totalcost'].sum().to_frame('MonetaryAgg')

aggData = R_Agg.join(F_Agg).join(M_Agg)

# Score Recency, Frequency and Monetary from 1 to 5 using quintile bin edges.
# The lowest edge of each set of bins is forced to 0 so that the smallest
# observed value is not the exclusive left edge of the first interval.
# include_lowest=True additionally keeps values that are exactly 0 in the
# first bin; without it pd.cut scores them NaN and the later ``astype(int)``
# on the scores fails.
# NOTE(review): pd.cut raises ValueError when two quantile edges coincide,
# which can happen with heavily tied data — confirm against the real input.

# Recency: a smaller day difference gets a higher score, so labels descend.
bins = aggData.RecencyAgg.quantile(q=[0, 0.2, 0.4, 0.6, 0.8, 1], interpolation='nearest')
bins[0] = 0
labels = [5, 4, 3, 2, 1]
R_S = pd.cut(aggData.RecencyAgg, bins, labels=labels, include_lowest=True)

# Frequency: more order rows get a higher score.
bins = aggData.FrequencyAgg.quantile(q=[0, 0.2, 0.4, 0.6, 0.8, 1], interpolation='nearest')
bins[0] = 0
labels = [1, 2, 3, 4, 5]
F_S = pd.cut(aggData.FrequencyAgg, bins, labels=labels, include_lowest=True)

# Monetary: a larger total gets a higher score.
bins = aggData.MonetaryAgg.quantile(q=[0, 0.2, 0.4, 0.6, 0.8, 1], interpolation='nearest')
bins[0] = 0
labels = [1, 2, 3, 4, 5]
M_S = pd.cut(aggData.MonetaryAgg, bins, labels=labels, include_lowest=True)


# Attach the three scores to the aggregate frame, then combine them into a
# three-digit code: R in the hundreds, F in the tens, M in the units.
aggData = aggData.assign(R_S=R_S, F_S=F_S, M_S=M_S)

aggData['RFM'] = R_S.astype(int) * 100 + F_S.astype(int) * 10 + M_S.astype(int)


# Split the combined RFM code into eight levels using octile bin edges;
# higher codes land in higher-numbered levels.
octiles = [step / 8 for step in range(9)]
bins = aggData.RFM.quantile(q=octiles, interpolation='nearest')
bins[0] = 0  # widen the first right-closed bin so the minimum code falls inside it
labels = list(range(1, 9))
aggData['level'] = pd.cut(aggData['RFM'], bins=bins, labels=labels)

# Turn the (user_email, product_name) index back into ordinary columns.
aggData = aggData.reset_index()

# Best customers first: highest level, then highest RFM code.
fe = aggData.sort_values(['level', 'RFM'], ascending=[False, False])

# Number of rows per level.
# The original ``[...]['user_email','product_name'].agg({'size': np.size})``
# relies on tuple column selection (removed in pandas 2.0) and on dict
# renaming in agg (removed in pandas 1.0). Both columns come from the groupby
# keys and hold no missing values, so count() equals the group size; the
# concat keeps the 'size' top-level column header of the original output.
counts = aggData.groupby(by=['level'])[['user_email', 'product_name']].count()
dd = pd.concat({'size': counts}, axis=1)

print(fe.head())
fe.to_csv('./rfm_data.csv', index=False)
print("---------------")
print(dd)

 

 

 

# -*- coding: utf-8 -*-

'''
描述:案例-基於RFM的用戶價值度模型
程序輸入:sales.csv
程序輸出:RFM得分數據寫本地文件sales_rfm_score.csv和數據表(sales_rfm_score)
'''
# 導入庫
import time  # 導入時間庫
import numpy as np  # 導入numpy庫
import pandas as pd  # 導入pandas庫
import mysql.connector  # 導入mysql連接庫

# Read the order file; USERID becomes the index and the listed columns are
# read with explicit dtypes.
dtypes = dict(ORDERDATE=object, ORDERID=object, AMOUNTINFO=np.float32)
raw_data = pd.read_csv('sales.csv', index_col='USERID', dtype=dtypes)

# Data review: print an overview, then report missing values.
print('Data Overview:', raw_data.head(4), '-' * 30, sep='\n')  # first 4 rows
print('Data DESC:', raw_data.describe(), '-' * 60, sep='\n')  # descriptive statistics

# Missing-value review, per column and per row.
missing = raw_data.isnull()
na_cols = missing.any(axis=0)  # True for every column containing a missing value
print('NA Cols:', na_cols, '-' * 30, sep='\n')
na_lines = missing.any(axis=1)  # True for every row containing a missing value
print('NA Recors:')
print('Total number of NA lines is: {0}'.format(na_lines.sum()))
print(raw_data.loc[na_lines])  # only the rows that contain a missing value
print('-' * 60)

# Cleaning and type conversion.
# Drop rows with missing values, then drop orders whose amount is <= 1.
# .copy() makes the filtered frame independent of its parent so that the
# column assignment below does not trigger pandas' SettingWithCopyWarning.
sales_data = raw_data.dropna()
sales_data = sales_data[sales_data['AMOUNTINFO'] > 1].copy()

# Parse the order date strings into datetimes.
sales_data['ORDERDATE'] = pd.to_datetime(sales_data['ORDERDATE'], format='%Y-%m-%d')
print('Raw Dtypes:')
print(sales_data.dtypes)  # dtypes of every column after conversion
print('-' * 60)

# Collapse the orders to one value per index entry (the frame is indexed by
# USERID when it is read).
recency_value = sales_data['ORDERDATE'].groupby(sales_data.index).max()  # latest order date
frequency_value = sales_data['ORDERDATE'].groupby(sales_data.index).count()  # number of orders
monetary_value = sales_data['AMOUNTINFO'].groupby(sales_data.index).sum()  # total order amount

# Score R, F and M from 1 to 5 with five equal-width bins each.
# pd.Timestamp replaces ``pd.datetime(2017, 01, 01)``: zero-padded integer
# literals are a syntax error on Python 3, and the pd.datetime alias was
# removed in pandas 2.0.
deadline_date = pd.Timestamp(2017, 1, 1)  # reference date the recency is measured against
r_interval = (deadline_date - recency_value).dt.days  # days between last order and the reference date
r_score = pd.cut(r_interval, 5, labels=[5, 4, 3, 2, 1])  # shorter interval -> higher score
f_score = pd.cut(frequency_value, 5, labels=[1, 2, 3, 4, 5])
m_score = pd.cut(monetary_value, 5, labels=[1, 2, 3, 4, 5])

# Combine the three scores into one integer frame that shares the index of
# the per-user aggregates.
rfm_list = [r_score, f_score, m_score]
rfm_cols = ['r_score', 'f_score', 'm_score']
rfm_pd = pd.DataFrame(np.array(rfm_list).transpose(), dtype=np.int32, columns=rfm_cols,
                      index=frequency_value.index)
print('RFM Score Overview:')
print(rfm_pd.head(4))
print('-' * 60)

# Total score, method 1: weighted sum (R 0.6, F 0.3, M 0.1).
rfm_pd['rfm_wscore'] = rfm_pd['r_score'] * 0.6 + rfm_pd['f_score'] * 0.3 + rfm_pd['m_score'] * 0.1

# Total score, method 2: the three digits concatenated as text.
# astype(str) is used instead of astype('string'): the 'string' alias is not
# understood by pandas releases before 1.0 and selects the extension
# StringDtype afterwards, whereas plain str behaves the same everywhere and
# matches the third script in this file.
rfm_pd_tmp = rfm_pd[rfm_cols].astype(str)
rfm_pd['rfm_comb'] = rfm_pd_tmp['r_score'].str.cat(rfm_pd_tmp['f_score']).str.cat(rfm_pd_tmp['m_score'])

# Report the final scores (first 4 rows, then descriptive statistics) and
# save them to a local CSV file.
print(
    'Final RFM Scores Overview:',
    rfm_pd.head(4),
    '-' * 30,
    'Final RFM Scores DESC:',
    rfm_pd.describe(),
    sep='\n',
)

rfm_pd.to_csv('sales_rfm_score.csv')

# Save the RFM scores to a MySQL table.
table_name = 'sales_rfm_score'  # target table
# Connection settings.
# NOTE(review): the credentials are hard-coded in source; consider reading
# them from the environment or from a config file kept out of version control.
config = {'host': '127.0.0.1',
          'user': 'root',
          'password': '123456',
          'port': 3306,
          'database': 'python_data',
          'charset': 'gb2312'
          }
con = mysql.connector.connect(**config)
try:
    cursor = con.cursor()
    try:
        # Create the target table if the database does not have it yet.
        cursor.execute("show tables")
        table_list = [t[0] for t in cursor.fetchall()]
        if table_name not in table_list:
            cursor.execute('''
    CREATE TABLE %s (
    userid               VARCHAR(20),
    r_score               int(2),
    f_score              int(2),
    m_score              int(2),
    rfm_wscore              DECIMAL(10,2),
    rfm_comb              VARCHAR(10),
    insert_date              VARCHAR(20)
    )ENGINE=InnoDB DEFAULT CHARSET=gb2312
    ''' % table_name)

        timestamp = time.strftime('%Y-%m-%d', time.localtime(time.time()))  # insert date
        print('Begin to insert data into table {0}...'.format(table_name))

        # Only the table name (a constant above) is formatted into the
        # statement; every value is passed as a bound parameter so the driver
        # does the quoting and escaping.
        insert_sql = "INSERT INTO `{0}` VALUES (%s,%s,%s,%s,%s,%s,%s)".format(table_name)
        # Convert to built-in Python types so the driver does not have to
        # handle NumPy scalars.
        rows = [
            (str(uid), int(r), int(f), int(m), float(w), str(comb), timestamp)
            for uid, r, f, m, w, comb in zip(
                rfm_pd.index,
                rfm_pd['r_score'],
                rfm_pd['f_score'],
                rfm_pd['m_score'],
                rfm_pd['rfm_wscore'],
                rfm_pd['rfm_comb'],
            )
        ]
        if rows:
            cursor.executemany(insert_sql, rows)
        con.commit()  # one commit for the whole batch instead of one per row
    finally:
        cursor.close()
finally:
    con.close()
# len(rows) is also correct for an empty frame, where the old loop variable
# would have been undefined.
print('Finish inserting, total records is: %d' % len(rows))

 第三版

import pandas as pd
import numpy as np


# Load the worksheet with user_id as the index and parse the purchase dates.
df = pd.read_excel(
    './zue_edu.xlsx',
    sheet_name='Q2未消費的',
    index_col='user_id',
).assign(
    ptdate=lambda frame: pd.to_datetime(frame['ptdate'], format='%Y-%m-%d'),
)

# Raw per-product inputs.
recency_value = df['ptdate'].groupby(df['product_name']).max()  # latest purchase date per product
r_interval = (pd.to_datetime('2019-05-06', format='%Y-%m-%d') - recency_value).dt.days  # days from that date to 2019-05-06
frequency_value = df['ptdate'].groupby([df['user_email'], df['product_name']]).count()  # rows per (user, product)
monetary_value = df['cost'].groupby(df['product_name']).sum()  # total cost per product
frequency_value = frequency_value.groupby('product_name').max()  # largest per-user count for each product

# Assemble the raw values into one frame indexed by product_name.
# pd.concat aligns the four series on their shared index and keeps each
# column's own dtype; routing them through a single NumPy array would force
# the date, integer and float columns into one common dtype.
data_list = [recency_value, r_interval, frequency_value, monetary_value]
data_pd = pd.concat(data_list, axis=1, keys=['上次消費時間', '已停投天數', '有消費天數', '消費金額'])


# Score R, F and M from 1 to 5 with five equal-width bins each; for recency
# the labels descend so that a shorter interval gets the higher score.
r_score = pd.cut(r_interval, bins=5, labels=[5, 4, 3, 2, 1])
f_score = pd.cut(frequency_value, bins=5, labels=[1, 2, 3, 4, 5])
m_score = pd.cut(monetary_value, bins=5, labels=[1, 2, 3, 4, 5])

# Stack the three score series column-wise into one integer frame that shares
# the index of the frequency aggregate.
rfm_cols = ['r_score', 'f_score', 'm_score']
rfm_list = [r_score, f_score, m_score]
rfm_pd = pd.DataFrame(
    np.array(rfm_list).T,
    index=frequency_value.index,
    columns=rfm_cols,
    dtype=np.int32,
)

# Total score, method 1: weighted sum (R 0.6, F 0.3, M 0.1).
rfm_pd['rfm_wscore'] = 0.6 * rfm_pd['r_score'] + 0.3 * rfm_pd['f_score'] + 0.1 * rfm_pd['m_score']

# Total score, method 2: the three digits concatenated as text and stored
# back as an integer.
score_text = rfm_pd[['r_score', 'f_score', 'm_score']].astype('str')
rfm_pd['rfm_comb'] = (score_text['r_score'] + score_text['f_score'] + score_text['m_score']).astype('int')

# Text label for every individual score.
# NOTE(review): scores 5, 4 and 2 map to empty strings — confirm whether the
# label text for them is missing.
grade_names = {5:'',4:'',3:'一般',2:'',1:'非常差'}
rfm_pd = rfm_pd.assign(**{
    '最近級別': rfm_pd['r_score'].replace(grade_names),
    '頻次': rfm_pd['f_score'].replace(grade_names),
    '金額': rfm_pd['m_score'].replace(grade_names),
})

# Split the weighted score into eight levels using octile bin edges.
# NOTE(review): pd.cut raises ValueError when two quantile edges coincide,
# which can happen when the weighted score has few distinct values — confirm
# on real data.
octiles = [step / 8 for step in range(9)]
bins = rfm_pd.rfm_wscore.quantile(q=octiles, interpolation='nearest')
bins[0] = 0  # start the first bin at 0
labels = list(range(1, 9))
rfm_pd['level'] = pd.cut(rfm_pd['rfm_wscore'], bins=bins, labels=labels)

# Expose product_name as a column on both frames, order the scores by the
# combined code (highest first) and attach the raw values to every row.
rfm_pd = rfm_pd.reset_index()
data_pd = data_pd.reset_index()
fe = rfm_pd.sort_values(by='rfm_comb', ascending=False)
fina = pd.merge(fe, data_pd, on='product_name')

# Show a preview and write the result to an Excel file.
print(fina.head())
fina.to_excel('./rfm_edu_Q2non.xlsx', index=False)

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯繫本站郵箱 yoyou2525@163.com 刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM