# RFM scoring, first version: quantile-based R/F/M scores for every
# (user_email, product_name) pair.
# Input:  ./zue_164466.csv (columns used: user_email, product_name, ptdate,
#         totalcost)
# Output: ./rfm_data.csv
import numpy as np
import pandas as pd

QUANTILE_RFM_KEYS = ['user_email', 'product_name']
QUANTILE_RFM_FIFTHS = [0, 0.2, 0.4, 0.6, 0.8, 1]
QUANTILE_RFM_EIGHTHS = [0, 0.125, 0.25, 0.375, 0.5, 0.625, 0.75, 0.875, 1]


def _score_by_quantile(values, quantiles, labels):
    """Bin ``values`` at the given quantiles and return one label per value.

    Args:
        values: numeric ``pandas.Series`` to score.
        quantiles: increasing quantile levels, starting at 0 and ending at 1.
        labels: one label per bin, i.e. ``len(quantiles) - 1`` entries.

    Returns:
        An ordered categorical Series aligned with ``values`` whose categories
        are ``labels``.
    """
    categories = list(labels)
    if not values.notna().any():
        # Nothing to rank: return an all-missing result instead of failing
        # inside ``pd.cut`` on NaN edges.
        empty = pd.Categorical([None] * len(values), categories=categories,
                               ordered=True)
        return pd.Series(empty, index=values.index)

    edges = values.quantile(q=quantiles, interpolation='nearest')
    edges = edges.to_numpy(dtype=float, copy=True)
    # The original forced the lowest edge to 0, which turned values <= 0 into
    # NaN. An open lower bound gives the same bins for positive values.
    edges[0] = -np.inf
    # 'nearest' quantiles frequently repeat (e.g. many customers with a
    # single purchase). ``pd.cut`` rejects duplicate edges, so drop the
    # zero-width bins together with their labels.
    keep = np.diff(edges) > 0
    kept_edges = np.concatenate((edges[:1], edges[1:][keep]))
    kept_labels = [label for label, used in zip(categories, keep) if used]
    scored = pd.cut(values, kept_edges, labels=kept_labels)
    return scored.cat.set_categories(categories, ordered=True)


def build_quantile_rfm(df, today=None):
    """Compute RFM aggregates, scores and an 8-step level per customer/product.

    Args:
        df: purchase records with columns ``user_email``, ``product_name``,
            ``ptdate`` (``%Y-%m-%d``) and ``totalcost``. Not modified.
        today: reference date for recency; defaults to the current time.

    Returns:
        DataFrame with the group keys, ``RecencyAgg``, ``FrequencyAgg``,
        ``MonetaryAgg``, ``R_S``, ``F_S``, ``M_S``, ``RFM`` and ``level``,
        sorted by ``level`` then ``RFM``, both descending.
    """
    today = pd.to_datetime('today') if today is None else pd.to_datetime(today)
    purchases = df.assign(
        ptdate=pd.to_datetime(df['ptdate'], format='%Y-%m-%d'))
    purchases['dateDiff'] = (today - purchases['ptdate']).dt.days

    # Named aggregation replaces the dict-renaming form of ``agg`` that
    # pandas removed in 1.0.
    agg_data = purchases.groupby(by=QUANTILE_RFM_KEYS).agg(
        RecencyAgg=('dateDiff', 'min'),
        FrequencyAgg=('ptdate', 'size'),
        MonetaryAgg=('totalcost', 'sum'),
    )

    # A smaller recency is better, so its labels run from 5 down to 1.
    r_s = _score_by_quantile(agg_data['RecencyAgg'], QUANTILE_RFM_FIFTHS,
                             [5, 4, 3, 2, 1])
    f_s = _score_by_quantile(agg_data['FrequencyAgg'], QUANTILE_RFM_FIFTHS,
                             [1, 2, 3, 4, 5])
    m_s = _score_by_quantile(agg_data['MonetaryAgg'], QUANTILE_RFM_FIFTHS,
                             [1, 2, 3, 4, 5])
    agg_data['R_S'] = r_s
    agg_data['F_S'] = f_s
    agg_data['M_S'] = m_s
    agg_data['RFM'] = (100 * r_s.astype(int) + 10 * f_s.astype(int)
                       + 1 * m_s.astype(int))
    agg_data['level'] = _score_by_quantile(
        agg_data['RFM'], QUANTILE_RFM_EIGHTHS, [1, 2, 3, 4, 5, 6, 7, 8])

    agg_data = agg_data.reset_index()
    return agg_data.sort_values(['level', 'RFM'], ascending=[False, False])


def run_quantile_rfm(csv_path='./zue_164466.csv', out_path='./rfm_data.csv'):
    """Read the purchase CSV, score it, print a summary and save the result."""
    df = pd.read_csv(csv_path)
    fe = build_quantile_rfm(df)
    # Number of customer/product pairs per level, empty levels included.
    dd = fe.groupby('level', observed=False).size().rename('size')
    print(fe.head())
    fe.to_csv(out_path, index=False)
    print("---------------")
    print(dd)


if __name__ == '__main__':
    run_quantile_rfm()
# -*- coding: utf-8 -*-
# Case study: customer value model based on RFM.
# Input:  sales.csv
# Output: RFM scores written to the local file sales_rfm_score.csv and to
#         the MySQL table sales_rfm_score.
import time

import numpy as np
import pandas as pd

SALES_RFM_TABLE = 'sales_rfm_score'
# NOTE(review): credentials are hard-coded here exactly as in the original
# script; they should come from the environment or a secrets store.
SALES_RFM_DB_CONFIG = {
    'host': '127.0.0.1',
    'user': 'root',
    'password': '123456',
    'port': 3306,
    'database': 'python_data',
    'charset': 'gb2312',
}


def load_sales_data(csv_path='sales.csv'):
    """Read the sales CSV, print a data-quality report and return clean rows.

    Rows with missing values and rows whose ``AMOUNTINFO`` is <= 1 are
    dropped, and ``ORDERDATE`` is parsed as a ``%Y-%m-%d`` date. The result
    is indexed by ``USERID``.
    """
    dtypes = {'ORDERDATE': object, 'ORDERID': object, 'AMOUNTINFO': np.float32}
    raw_data = pd.read_csv(csv_path, dtype=dtypes, index_col='USERID')

    # Data overview.
    print('Data Overview:')
    print(raw_data.head(4))
    print('-' * 30)
    print('Data DESC:')
    print(raw_data.describe())
    print('-' * 60)

    # Missing-value review, per column and then per row.
    na_cols = raw_data.isnull().any(axis=0)
    print('NA Cols:')
    print(na_cols)
    print('-' * 30)
    na_lines = raw_data.isnull().any(axis=1)
    print('NA Recors:')
    print('Total number of NA lines is: {0}'.format(na_lines.sum()))
    print(raw_data[na_lines])
    print('-' * 60)

    sales_data = raw_data.dropna()
    # ``.copy()`` so the date conversion below writes to an independent
    # frame rather than to a filtered view of ``raw_data``.
    sales_data = sales_data[sales_data['AMOUNTINFO'] > 1].copy()
    sales_data['ORDERDATE'] = pd.to_datetime(sales_data['ORDERDATE'],
                                             format='%Y-%m-%d')
    print('Raw Dtypes:')
    print(sales_data.dtypes)
    print('-' * 60)
    return sales_data


def score_sales_rfm(sales_data, deadline_date=None):
    """Score every user on recency, frequency and monetary value.

    Args:
        sales_data: orders indexed by user id, with a datetime ``ORDERDATE``
            column and a numeric ``AMOUNTINFO`` column.
        deadline_date: date that recency is measured against; defaults to
            2017-01-01 as in the original script.

    Returns:
        DataFrame indexed by user id with int32 columns ``r_score``,
        ``f_score``, ``m_score``, the weighted score ``rfm_wscore``
        (0.6 / 0.3 / 0.1) and the concatenated string ``rfm_comb``.
    """
    if deadline_date is None:
        # ``pd.datetime(2017, 01, 01)`` is a syntax error on Python 3 and
        # ``pd.datetime`` no longer exists in pandas 2.
        deadline_date = pd.Timestamp(2017, 1, 1)
    deadline_date = pd.Timestamp(deadline_date)

    per_user = sales_data.groupby(level=0)
    recency_value = per_user['ORDERDATE'].max()
    frequency_value = per_user['ORDERDATE'].count()
    monetary_value = per_user['AMOUNTINFO'].sum()

    r_interval = (deadline_date - recency_value).dt.days
    # Five equal-width bins each; a short interval is good, so the recency
    # labels are reversed.
    r_score = pd.cut(r_interval, 5, labels=[5, 4, 3, 2, 1])
    f_score = pd.cut(frequency_value, 5, labels=[1, 2, 3, 4, 5])
    m_score = pd.cut(monetary_value, 5, labels=[1, 2, 3, 4, 5])

    rfm_pd = pd.DataFrame({
        'r_score': r_score.astype(np.int32),
        'f_score': f_score.astype(np.int32),
        'm_score': m_score.astype(np.int32),
    })

    # Method 1: weighted score.
    rfm_pd['rfm_wscore'] = (rfm_pd['r_score'] * 0.6 + rfm_pd['f_score'] * 0.3
                            + rfm_pd['m_score'] * 0.1)
    # Method 2: the three digits concatenated, e.g. '543'.
    rfm_pd['rfm_comb'] = (rfm_pd['r_score'].astype(str)
                          + rfm_pd['f_score'].astype(str)
                          + rfm_pd['m_score'].astype(str))
    return rfm_pd


def save_sales_scores_to_mysql(rfm_pd, table_name=SALES_RFM_TABLE,
                               config=None):
    """Insert the scores into MySQL, creating the table if it is missing.

    Args:
        rfm_pd: result of :func:`score_sales_rfm`.
        table_name: target table; letters, digits and underscores only.
        config: keyword arguments for ``mysql.connector.connect``.

    Returns:
        Number of rows inserted.

    Raises:
        ValueError: if ``table_name`` is not a plain identifier.
    """
    # Imported here so the scoring functions work without a database driver.
    import mysql.connector

    # The table name cannot be sent as a query parameter, so validate it
    # before it is formatted into the statements.
    if not table_name or not table_name.replace('_', '').isalnum():
        raise ValueError('invalid table name: {0!r}'.format(table_name))
    if config is None:
        config = SALES_RFM_DB_CONFIG

    timestamp = time.strftime('%Y-%m-%d')
    # The connector cannot convert numpy scalars, so cast to plain types.
    rows = [
        (str(row.Index), int(row.r_score), int(row.f_score), int(row.m_score),
         float(row.rfm_wscore), str(row.rfm_comb), timestamp)
        for row in rfm_pd.itertuples()
    ]
    insert_sql = ("INSERT INTO `{0}` VALUES (%s, %s, %s, %s, %s, %s, %s)"
                  .format(table_name))

    con = mysql.connector.connect(**config)
    try:
        cursor = con.cursor()
        try:
            cursor.execute("show tables")
            existing_tables = {record[0] for record in cursor.fetchall()}
            if table_name not in existing_tables:
                cursor.execute('''
                CREATE TABLE `%s` (
                userid               VARCHAR(20),
                r_score               int(2),
                f_score              int(2),
                m_score              int(2),
                rfm_wscore              DECIMAL(10,2),
                rfm_comb              VARCHAR(10),
                insert_date              VARCHAR(20)
                )ENGINE=InnoDB DEFAULT CHARSET=gb2312
                ''' % table_name)
            print('Begin to insert data into table {0}...'.format(table_name))
            if rows:
                cursor.executemany(insert_sql, rows)
            con.commit()
        finally:
            cursor.close()
    finally:
        con.close()
    # Report the real row count; the original used the loop variable, which
    # is undefined when there is nothing to insert.
    print('Finish inserting, total records is: %d' % len(rows))
    return len(rows)


def run_sales_rfm(csv_path='sales.csv', out_path='sales_rfm_score.csv'):
    """Run the whole pipeline: load, score, print, save to CSV and MySQL."""
    sales_data = load_sales_data(csv_path)
    rfm_pd = score_sales_rfm(sales_data)

    print('RFM Score Overview:')
    print(rfm_pd[['r_score', 'f_score', 'm_score']].head(4))
    print('-' * 60)
    print('Final RFM Scores Overview:')
    print(rfm_pd.head(4))
    print('-' * 30)
    print('Final RFM Scores DESC:')
    print(rfm_pd.describe())

    rfm_pd.to_csv(out_path)
    save_sales_scores_to_mysql(rfm_pd)


if __name__ == '__main__':
    run_sales_rfm()
# Third version
# RFM scoring, third version: one row per product, equal-width R/F/M scores
# plus an 8-step level derived from the weighted score.
# Input:  ./zue_edu.xlsx (sheet 'Q2未消費的', indexed by user_id)
# Output: ./rfm_edu_Q2non.xlsx
import numpy as np
import pandas as pd

PRODUCT_RFM_EIGHTHS = [0, 0.125, 0.25, 0.375, 0.5, 0.625, 0.75, 0.875, 1]
# Text grade shown next to each 1-5 score.
PRODUCT_RFM_GRADES = {5: '優', 4: '良', 3: '一般', 2: '差', 1: '非常差'}


def _product_level_by_quantile(values, quantiles, labels):
    """Bin ``values`` at the given quantiles and return one label per value.

    Args:
        values: numeric ``pandas.Series`` to rank.
        quantiles: increasing quantile levels, starting at 0 and ending at 1.
        labels: one label per bin, i.e. ``len(quantiles) - 1`` entries.

    Returns:
        An ordered categorical Series aligned with ``values`` whose categories
        are ``labels``.
    """
    categories = list(labels)
    if not values.notna().any():
        # Nothing to rank: return an all-missing result instead of failing
        # inside ``pd.cut`` on NaN edges.
        empty = pd.Categorical([None] * len(values), categories=categories,
                               ordered=True)
        return pd.Series(empty, index=values.index)

    edges = values.quantile(q=quantiles, interpolation='nearest')
    edges = edges.to_numpy(dtype=float, copy=True)
    # The original set the starting edge to 0; an open lower bound keeps the
    # same bins for positive values and cannot exclude the minimum.
    edges[0] = -np.inf
    # The weighted score takes few distinct values, so neighbouring quantile
    # edges are often equal. ``pd.cut`` rejects duplicate edges, so drop the
    # zero-width bins together with their labels.
    keep = np.diff(edges) > 0
    kept_edges = np.concatenate((edges[:1], edges[1:][keep]))
    kept_labels = [label for label, used in zip(categories, keep) if used]
    ranked = pd.cut(values, kept_edges, labels=kept_labels)
    return ranked.cat.set_categories(categories, ordered=True)


def build_product_rfm(df, deadline_date='2019-05-06'):
    """Score every product on recency, frequency and monetary value.

    Args:
        df: spend records with columns ``user_email``, ``product_name``,
            ``ptdate`` (``%Y-%m-%d``) and ``cost``. Not modified.
        deadline_date: date that recency is measured against.

    Returns:
        One row per product, sorted by ``rfm_comb`` descending, holding the
        scores, their text grades, ``level`` and the raw values the scores
        were derived from.
    """
    orders = df.assign(ptdate=pd.to_datetime(df['ptdate'], format='%Y-%m-%d'))
    by_product = orders.groupby('product_name')

    recency_value = by_product['ptdate'].max()
    r_interval = (pd.Timestamp(deadline_date) - recency_value).dt.days
    # Frequency of a product is the highest per-user record count.
    frequency_value = (
        orders.groupby(['user_email', 'product_name'])['ptdate'].count()
        .groupby(level='product_name').max()
    )
    monetary_value = by_product['cost'].sum()

    # Build from named Series so columns align on product_name and keep
    # their dtypes, instead of going through a mixed-type numpy array.
    data_pd = pd.DataFrame({
        '上次消費時間': recency_value,
        '已停投天數': r_interval,
        '有消費天數': frequency_value,
        '消費金額': monetary_value,
    })

    # Five equal-width bins each; a short interval is good, so the recency
    # labels are reversed.
    rfm_pd = pd.DataFrame({
        'r_score': pd.cut(r_interval, 5,
                          labels=[5, 4, 3, 2, 1]).astype(np.int32),
        'f_score': pd.cut(frequency_value, 5,
                          labels=[1, 2, 3, 4, 5]).astype(np.int32),
        'm_score': pd.cut(monetary_value, 5,
                          labels=[1, 2, 3, 4, 5]).astype(np.int32),
    })

    # Method 1: weighted score.
    rfm_pd['rfm_wscore'] = (rfm_pd['r_score'] * 0.6 + rfm_pd['f_score'] * 0.3
                            + rfm_pd['m_score'] * 0.1)
    # Method 2: the three digits concatenated and read as one integer.
    rfm_pd['rfm_comb'] = (rfm_pd['r_score'].astype(str)
                          + rfm_pd['f_score'].astype(str)
                          + rfm_pd['m_score'].astype(str)).astype('int')

    rfm_pd['最近級別'] = rfm_pd['r_score'].map(PRODUCT_RFM_GRADES)
    rfm_pd['頻次'] = rfm_pd['f_score'].map(PRODUCT_RFM_GRADES)
    rfm_pd['金額'] = rfm_pd['m_score'].map(PRODUCT_RFM_GRADES)

    rfm_pd['level'] = _product_level_by_quantile(
        rfm_pd['rfm_wscore'], PRODUCT_RFM_EIGHTHS, [1, 2, 3, 4, 5, 6, 7, 8])

    rfm_pd = rfm_pd.rename_axis('product_name').reset_index()
    data_pd = data_pd.rename_axis('product_name').reset_index()
    fe = rfm_pd.sort_values('rfm_comb', ascending=False)
    return fe.merge(data_pd, on='product_name')


def run_product_rfm(xlsx_path='./zue_edu.xlsx', sheet_name='Q2未消費的',
                    out_path='./rfm_edu_Q2non.xlsx'):
    """Read the Excel sheet, score each product, print and save the result."""
    df = pd.read_excel(xlsx_path, sheet_name=sheet_name, index_col='user_id')
    fina = build_product_rfm(df)
    print(fina.head())
    fina.to_excel(out_path, index=False)


if __name__ == '__main__':
    run_product_rfm()