This chapter is about transfer learning. In short: when a new product launches, you build its model using data from other, more mature products plus the small amount of data the new product has accumulated. I won't type out the details here; search Baidu if you want more.
That said, my impression is that many companies do not adopt transfer learning in practice; it is hard to operationalize.
1. Data preparation
This step splits the data into the target domain, the source domain, and an out-of-time (OOT) sample.
import pandas as pd
import numpy as np
from sklearn.metrics import roc_curve
from sklearn.linear_model import LogisticRegression

data = pd.read_excel('xxx/tra_sample.xlsx')
data.head()

feature_lst = ['zx_score','msg_cnt','phone_num_cnt','register_days']

train = data[data.type == 'target'].reset_index().copy()   # target domain
diff  = data[data.type == 'origin'].reset_index().copy()   # source domain
val   = data[data.type == 'offtime'].reset_index().copy()  # out-of-time (OOT) sample

'''
TrainS: target-domain samples
TrainA: source-domain samples
LabelS: target-domain labels
LabelA: source-domain labels
'''
train = train.loc[:1200]  # keep rows with index 0-1200 (.loc is inclusive, so 1201 rows)
trans_S = train[feature_lst].copy()
label_S = train['bad_ind'].copy()

trans_A = diff[feature_lst].copy()
label_A = diff['bad_ind'].copy()

val_x = val[feature_lst].copy()
val_y = val['bad_ind'].copy()

test = val_x.copy()
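Before modeling, it helps to check the size and bad rate of each slice; in the new-product setting the target domain is usually much smaller than the source domain. A quick sanity check using the frames built above:

# Sample count and bad rate for each domain
for name, part in [('target', train), ('source', diff), ('oot', val)]:
    print(name, len(part), round(part['bad_ind'].mean(), 4))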
2. Modeling on the target domain
Build a model on the target-domain data only, then evaluate it on the OOT sample.
#%% Model on target-domain data only, then evaluate on the OOT sample
lr_model = LogisticRegression(C=0.1, class_weight='balanced', solver='liblinear')
lr_model.fit(trans_S, label_S)

y_pred = lr_model.predict_proba(trans_S)[:,1]
fpr_lr_train, tpr_lr_train, _ = roc_curve(label_S, y_pred)
train_ks = abs(fpr_lr_train - tpr_lr_train).max()
print('train_ks : ', train_ks)
# train_ks :  0.48500238435860754

y_pred = lr_model.predict_proba(test)[:,1]
fpr_lr, tpr_lr, _ = roc_curve(val_y, y_pred)
val_ks = abs(fpr_lr - tpr_lr).max()
print('val_ks : ', val_ks)
# val_ks :  0.3887057754389137

from matplotlib import pyplot as plt
plt.plot(fpr_lr_train, tpr_lr_train, label='train LR')
plt.plot(fpr_lr, tpr_lr, label='evl LR')
plt.plot([0,1], [0,1], 'k--')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC Curve')
plt.legend(loc='best')
plt.show()

# Combined source + target data, used by the next two sections
trans_data = np.concatenate((trans_A, trans_S), axis=0)
trans_label = np.concatenate((label_A, label_S), axis=0)
The train and OOT KS differ by roughly 10 points (0.485 vs. 0.389) and the ROC curve is not smooth, which indicates the model generalizes poorly: the target-domain sample is too small.
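The KS statistic used throughout is just the maximum vertical gap between the TPR and FPR curves. A small helper equivalent to the inline computation above (the name ks_score is mine, not from the book):

from sklearn.metrics import roc_curve

def ks_score(y_true, y_prob):
    # KS statistic: max |TPR - FPR| over all score thresholds
    fpr, tpr, _ = roc_curve(y_true, y_prob)
    return abs(tpr - fpr).max()

# Usage: ks_score(val_y, lr_model.predict_proba(test)[:, 1])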

3. Modeling on the source domain
Build a model on the source-domain data, then evaluate it on the OOT sample.
#%% Model on source-domain data only, then evaluate on the OOT sample
lr_model = LogisticRegression(C=0.3, class_weight='balanced', solver='liblinear')
lr_model.fit(trans_A, label_A)

# Note: the "training" KS below is scored on the combined source + target data
y_pred = lr_model.predict_proba(trans_data)[:,1]
fpr_lr_train, tpr_lr_train, _ = roc_curve(trans_label, y_pred)
train_ks = abs(fpr_lr_train - tpr_lr_train).max()
print('train_ks : ', train_ks)
# train_ks :  0.4910909493184976

y_pred = lr_model.predict_proba(test)[:,1]
fpr_lr, tpr_lr, _ = roc_curve(val_y, y_pred)
val_ks = abs(fpr_lr - tpr_lr).max()
print('val_ks : ', val_ks)
# val_ks :  0.33077621830414

from matplotlib import pyplot as plt
plt.plot(fpr_lr_train, tpr_lr_train, label='train LR')
plt.plot(fpr_lr, tpr_lr, label='evl LR')
plt.plot([0,1], [0,1], 'k--')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC Curve')
plt.legend(loc='best')
plt.show()
The KS drops sharply out of time (0.491 vs. 0.331) and the ROC curve is unstable: a model trained purely on the source domain generalizes very poorly to the new product.
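The drop is consistent with distribution shift between the two products. As a rough check (my own addition, using the feature_lst built earlier), compare per-feature statistics across the source and target domains:

# Per-feature mean in each domain; large gaps suggest covariate shift
shift = pd.DataFrame({
    'source_mean': trans_A.mean(),
    'target_mean': trans_S.mean(),
})
shift['abs_gap'] = (shift['target_mean'] - shift['source_mean']).abs()
print(shift.sort_values('abs_gap', ascending=False))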

4. Modeling on combined source and target data with sample re-weighting
Train on the combined source and target domains, iteratively adjusting the weights of misclassified samples (a TrAdaBoost-style scheme with logistic regression as the base learner); performance is tracked on the target domain and tested on the OOT sample. A small numeric sketch of the weight update follows the code below.
#%% Combine the source and target domains with iterative re-weighting
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve

def Tr_lr_boost(trans_A, trans_S, label_A, label_S, test, label_test,
                N=500, early_stopping_rounds=100):
    """
    The base learner's regularization strength and the sample weights
    affect how quickly the procedure converges.
    H       classification scores for the test samples
    TrainS  target-domain samples
    TrainA  source-domain samples
    LabelS  target-domain labels
    LabelA  source-domain labels
    Test    test samples
    N       number of boosting iterations
    early_stopping_rounds  stop if KS has not improved for this many rounds
    """
    # Normalize the weight vector into a distribution
    # (label is unused; kept to match the original signature)
    def calculate_P(weights, label):
        total = np.sum(weights)
        return np.asarray(weights / total, order='C')

    # Logistic regression as the base classifier; returns scores and the model
    def train_classify(trans_data, trans_label, test_data, P):
        clf = LogisticRegression(C=0.3, class_weight='balanced', solver='liblinear')
        clf.fit(trans_data, trans_label, sample_weight=P[:, 0])
        return clf.predict_proba(test_data)[:,1], clf

    # Weighted error rate on the target domain
    def calculate_error_rate(label_R, label_H, weight):
        total = np.sum(weight)
        return np.sum(weight[:, 0] / total * np.abs(label_R - label_H))

    # Derive labels from the scores via a quantile threshold; note that
    # predict() cannot be used directly because it thresholds at 0.5
    def put_label(score_H, thred):
        new_label_H = []
        for i in score_H:
            if i <= thred:
                new_label_H.append(0)
            else:
                new_label_H.append(1)
        return new_label_H

    # Concatenate the datasets; N is effectively the number of base models
    trans_data = np.concatenate((trans_A, trans_S), axis=0)
    trans_label = np.concatenate((label_A, label_S), axis=0)

    # Sample counts of the three datasets
    row_A = trans_A.shape[0]
    row_S = trans_S.shape[0]
    row_T = test.shape[0]

    # All three datasets merged into one scoring set
    test_data = np.concatenate((trans_data, test), axis=0)

    # Initialize weights; the target domain starts with higher weight than the source
    weights_A = np.ones([row_A, 1]) / row_A
    weights_S = np.ones([row_S, 1]) / row_S * 2
    weights = np.concatenate((weights_A, weights_S), axis=0)

    # Fixed discount factor for source-domain weights; N is the iteration count.
    # Note: the TrAdaBoost paper uses np.sqrt(2 * np.log(row_A) / N); this code
    # follows the book and computes np.log(row_A / N), kept so the printed
    # results below stay reproducible.
    beta = 1 / (1 + np.sqrt(2 * np.log(row_A / N)))

    # Store each iteration's beta_t = error_rate / (1 - error_rate)
    beta_T = np.zeros([1, N])
    # Store each iteration's scores for every sample
    result_label = np.ones([row_A + row_S + row_T, N])

    trans_data = np.asarray(trans_data, order='C')
    trans_label = np.asarray(trans_label, order='C')
    test_data = np.asarray(test_data, order='C')

    best_ks = -1     # best KS so far
    best_round = -1  # round at which it occurred
    best_model = -1  # the corresponding model

    # Initialization done; training starts
    for i in range(N):
        P = calculate_P(weights, trans_label)
        result_label[:, i], model = train_classify(trans_data, trans_label,
                                                   test_data, P)
        score_H = result_label[row_A:row_A + row_S, i]
        pctg = np.sum(trans_label) / len(trans_label)
        thred = pd.DataFrame(score_H).quantile(1 - pctg)[0]
        label_H = put_label(score_H, thred)

        # Error rate on the target domain
        error_rate = calculate_error_rate(label_S, label_H,
                                          weights[row_A:row_A + row_S, :])
        # Guard against degenerate error rates
        if error_rate > 0.5:
            error_rate = 0.5
        if error_rate == 0:
            N = i
            break
        beta_T[0, i] = error_rate / (1 - error_rate)

        # Up-weight misclassified target-domain samples
        for j in range(row_S):
            weights[row_A + j] = weights[row_A + j] * np.power(
                beta_T[0, i], -np.abs(result_label[row_A + j, i] - label_S[j]))
        # Down-weight misclassified source-domain samples
        for j in range(row_A):
            weights[j] = weights[j] * np.power(
                beta, np.abs(result_label[j, i] - label_A[j]))

        y_pred = result_label[(row_A + row_S):, i]
        fpr_lr_train, tpr_lr_train, _ = roc_curve(label_test, y_pred)
        train_ks = abs(fpr_lr_train - tpr_lr_train).max()
        print('test_ks : ', train_ks, 'round', i + 1)

        # Instead of voting with the second half of the learners (as in the
        # original TrAdaBoost), keep only the single best logistic regression
        if train_ks > best_ks:
            best_ks = train_ks
            best_round = i
            best_model = model
        # Stop once KS has not improved for early_stopping_rounds rounds
        if best_round < i - early_stopping_rounds:
            break
    return best_ks, best_round, best_model

# Train and keep the best model
best_ks, best_round, best_model = Tr_lr_boost(trans_A, trans_S, label_A, label_S,
                                              test, label_test=val_y, N=300,
                                              early_stopping_rounds=20)

y_pred = best_model.predict_proba(trans_S)[:,1]
fpr_lr_train, tpr_lr_train, _ = roc_curve(label_S, y_pred)
train_ks = abs(fpr_lr_train - tpr_lr_train).max()
print('train_ks : ', train_ks)
# train_ks :  0.4629947544110634

y_pred = best_model.predict_proba(test)[:,1]
fpr_lr, tpr_lr, _ = roc_curve(val_y, y_pred)
val_ks = abs(fpr_lr - tpr_lr).max()
print('val_ks : ', val_ks)
# val_ks :  0.39846160021324123

from matplotlib import pyplot as plt
plt.plot(fpr_lr_train, tpr_lr_train, label='train LR')
plt.plot(fpr_lr, tpr_lr, label='evl LR')
plt.plot([0,1], [0,1], 'k--')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC Curve')
plt.legend(loc='best')
plt.show()
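To see why the re-weighting favors the target domain, here is a purely illustrative sketch of one weight update (the numbers are made up, not from the dataset): with a weighted target-domain error of 0.2, beta_t = 0.25, so a misclassified target sample's weight is multiplied by beta_t ** -1 = 4, while a misclassified source sample's weight is multiplied by a fixed beta < 1 and shrinks.

import numpy as np

error_rate = 0.2                        # hypothetical weighted error on the target domain
beta_t = error_rate / (1 - error_rate)  # 0.25

# Fixed source discount in the paper's form, with hypothetical
# n = 1000 source rows and N = 300 rounds
beta = 1 / (1 + np.sqrt(2 * np.log(1000) / 300))

# Both samples misclassified, i.e. |score - label| treated as 1
print(1.0 * beta_t ** -1)  # 4.0, target sample weight grows
print(1.0 * beta ** 1)     # about 0.82, source sample weight shrinks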

As for why these results differ somewhat from the author Mei's results, differences in the data are the most likely cause.
