This chapter is about transfer learning. In short: when a new product launches, you build its model using data from other, more mature products plus the small amount of data the new product has accumulated. I won't type out the details here; search Baidu if you want more.
That said, my impression is that many companies do not adopt transfer learning in practice; it is hard to operationalize.
1. Data preparation
This step splits the data into the target domain, the source domain, and an out-of-time (OOT) sample.
import pandas as pd
import numpy as np
from sklearn.metrics import roc_curve
from sklearn.linear_model import LogisticRegression

data = pd.read_excel('xxx/tra_sample.xlsx')
data.head()

feature_lst = ['zx_score','msg_cnt','phone_num_cnt','register_days']

train = data[data.type == 'target'].reset_index().copy()   # target domain
diff  = data[data.type == 'origin'].reset_index().copy()   # source domain
val   = data[data.type == 'offtime'].reset_index().copy()  # out-of-time (OOT) sample

'''
TrainS: target-domain samples
TrainA: source-domain samples
LabelS: target-domain labels
LabelA: source-domain labels
'''
train = train.loc[:1200]  # keep rows with index 0-1200 (.loc is inclusive, so 1201 rows)
trans_S = train[feature_lst].copy()
label_S = train['bad_ind'].copy()

trans_A = diff[feature_lst].copy()
label_A = diff['bad_ind'].copy()

val_x = val[feature_lst].copy()
val_y = val['bad_ind'].copy()

test = val_x.copy()
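Before modeling, it helps to check the size and bad rate of each slice; in the new-product setting the target domain is usually much smaller than the source domain. A quick sanity check using the frames built above:

# Sample count and bad rate for each domain
for name, part in [('target', train), ('source', diff), ('oot', val)]:
    print(name, len(part), round(part['bad_ind'].mean(), 4))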
2. Modeling on the target domain
Build a model on the target-domain data only, then evaluate it on the OOT sample.
#%% Model on target-domain data only, then evaluate on the OOT sample
lr_model = LogisticRegression(C=0.1, class_weight='balanced', solver='liblinear')
lr_model.fit(trans_S, label_S)

y_pred = lr_model.predict_proba(trans_S)[:,1]
fpr_lr_train, tpr_lr_train, _ = roc_curve(label_S, y_pred)
train_ks = abs(fpr_lr_train - tpr_lr_train).max()
print('train_ks : ', train_ks)
# train_ks :  0.48500238435860754

y_pred = lr_model.predict_proba(test)[:,1]
fpr_lr, tpr_lr, _ = roc_curve(val_y, y_pred)
val_ks = abs(fpr_lr - tpr_lr).max()
print('val_ks : ', val_ks)
# val_ks :  0.3887057754389137

from matplotlib import pyplot as plt
plt.plot(fpr_lr_train, tpr_lr_train, label='train LR')
plt.plot(fpr_lr, tpr_lr, label='evl LR')
plt.plot([0,1], [0,1], 'k--')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC Curve')
plt.legend(loc='best')
plt.show()

# Combined source + target data, used by the next two sections
trans_data = np.concatenate((trans_A, trans_S), axis=0)
trans_label = np.concatenate((label_A, label_S), axis=0)
The train and OOT KS differ by roughly 10 points (0.485 vs. 0.389) and the ROC curve is not smooth, which indicates the model generalizes poorly: the target-domain sample is too small.
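The KS statistic used throughout is just the maximum vertical gap between the TPR and FPR curves. A small helper equivalent to the inline computation above (the name ks_score is mine, not from the book):

from sklearn.metrics import roc_curve

def ks_score(y_true, y_prob):
    # KS statistic: max |TPR - FPR| over all score thresholds
    fpr, tpr, _ = roc_curve(y_true, y_prob)
    return abs(tpr - fpr).max()

# Usage: ks_score(val_y, lr_model.predict_proba(test)[:, 1])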

3. Modeling on the source domain
Build a model on the source-domain data, then evaluate it on the OOT sample.
#%% Model on source-domain data only, then evaluate on the OOT sample
lr_model = LogisticRegression(C=0.3, class_weight='balanced', solver='liblinear')
lr_model.fit(trans_A, label_A)

# Note: the "training" KS below is scored on the combined source + target data
y_pred = lr_model.predict_proba(trans_data)[:,1]
fpr_lr_train, tpr_lr_train, _ = roc_curve(trans_label, y_pred)
train_ks = abs(fpr_lr_train - tpr_lr_train).max()
print('train_ks : ', train_ks)
# train_ks :  0.4910909493184976

y_pred = lr_model.predict_proba(test)[:,1]
fpr_lr, tpr_lr, _ = roc_curve(val_y, y_pred)
val_ks = abs(fpr_lr - tpr_lr).max()
print('val_ks : ', val_ks)
# val_ks :  0.33077621830414

from matplotlib import pyplot as plt
plt.plot(fpr_lr_train, tpr_lr_train, label='train LR')
plt.plot(fpr_lr, tpr_lr, label='evl LR')
plt.plot([0,1], [0,1], 'k--')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC Curve')
plt.legend(loc='best')
plt.show()
The KS drops sharply out of time (0.491 vs. 0.331) and the ROC curve is unstable: a model trained purely on the source domain generalizes very poorly to the new product.
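The drop is consistent with distribution shift between the two products. As a rough check (my own addition, using the feature_lst built earlier), compare per-feature statistics across the source and target domains:

# Per-feature mean in each domain; large gaps suggest covariate shift
shift = pd.DataFrame({
    'source_mean': trans_A.mean(),
    'target_mean': trans_S.mean(),
})
shift['abs_gap'] = (shift['target_mean'] - shift['source_mean']).abs()
print(shift.sort_values('abs_gap', ascending=False))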

4. Modeling on combined source and target data with sample re-weighting
Train on the combined source and target domains, iteratively adjusting the weights of misclassified samples (a TrAdaBoost-style scheme with logistic regression as the base learner); performance is tracked on the target domain and tested on the OOT sample. A small numeric sketch of the weight update follows the code below.
#%% Combine the source and target domains with iterative re-weighting
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve

def Tr_lr_boost(trans_A, trans_S, label_A, label_S, test, label_test,
                N=500, early_stopping_rounds=100):
    """
    The base learner's regularization strength and the sample weights
    affect how quickly the procedure converges.
    H       classification scores for the test samples
    TrainS  target-domain samples
    TrainA  source-domain samples
    LabelS  target-domain labels
    LabelA  source-domain labels
    Test    test samples
    N       number of boosting iterations
    early_stopping_rounds  stop if KS has not improved for this many rounds
    """
    # Normalize the weight vector into a distribution
    # (label is unused; kept to match the original signature)
    def calculate_P(weights, label):
        total = np.sum(weights)
        return np.asarray(weights / total, order='C')

    # Logistic regression as the base classifier; returns scores and the model
    def train_classify(trans_data, trans_label, test_data, P):
        clf = LogisticRegression(C=0.3, class_weight='balanced', solver='liblinear')
        clf.fit(trans_data, trans_label, sample_weight=P[:, 0])
        return clf.predict_proba(test_data)[:,1], clf

    # Weighted error rate on the target domain
    def calculate_error_rate(label_R, label_H, weight):
        total = np.sum(weight)
        return np.sum(weight[:, 0] / total * np.abs(label_R - label_H))

    # Derive labels from the scores via a quantile threshold; note that
    # predict() cannot be used directly because it thresholds at 0.5
    def put_label(score_H, thred):
        new_label_H = []
        for i in score_H:
            if i <= thred:
                new_label_H.append(0)
            else:
                new_label_H.append(1)
        return new_label_H

    # Concatenate the datasets; N is effectively the number of base models
    trans_data = np.concatenate((trans_A, trans_S), axis=0)
    trans_label = np.concatenate((label_A, label_S), axis=0)

    # Sample counts of the three datasets
    row_A = trans_A.shape[0]
    row_S = trans_S.shape[0]
    row_T = test.shape[0]

    # All three datasets merged into one scoring set
    test_data = np.concatenate((trans_data, test), axis=0)

    # Initialize weights; the target domain starts with higher weight than the source
    weights_A = np.ones([row_A, 1]) / row_A
    weights_S = np.ones([row_S, 1]) / row_S * 2
    weights = np.concatenate((weights_A, weights_S), axis=0)

    # Fixed discount factor for source-domain weights; N is the iteration count.
    # Note: the TrAdaBoost paper uses np.sqrt(2 * np.log(row_A) / N); this code
    # follows the book and computes np.log(row_A / N), kept so the printed
    # results below stay reproducible.
    beta = 1 / (1 + np.sqrt(2 * np.log(row_A / N)))

    # Store each iteration's beta_t = error_rate / (1 - error_rate)
    beta_T = np.zeros([1, N])
    # Store each iteration's scores for every sample
    result_label = np.ones([row_A + row_S + row_T, N])

    trans_data = np.asarray(trans_data, order='C')
    trans_label = np.asarray(trans_label, order='C')
    test_data = np.asarray(test_data, order='C')

    best_ks = -1     # best KS so far
    best_round = -1  # round at which it occurred
    best_model = -1  # the corresponding model

    # Initialization done; training starts
    for i in range(N):
        P = calculate_P(weights, trans_label)
        result_label[:, i], model = train_classify(trans_data, trans_label,
                                                   test_data, P)
        score_H = result_label[row_A:row_A + row_S, i]
        pctg = np.sum(trans_label) / len(trans_label)
        thred = pd.DataFrame(score_H).quantile(1 - pctg)[0]
        label_H = put_label(score_H, thred)

        # Error rate on the target domain
        error_rate = calculate_error_rate(label_S, label_H,
                                          weights[row_A:row_A + row_S, :])
        # Guard against degenerate error rates
        if error_rate > 0.5:
            error_rate = 0.5
        if error_rate == 0:
            N = i
            break
        beta_T[0, i] = error_rate / (1 - error_rate)

        # Up-weight misclassified target-domain samples
        for j in range(row_S):
            weights[row_A + j] = weights[row_A + j] * np.power(
                beta_T[0, i], -np.abs(result_label[row_A + j, i] - label_S[j]))
        # Down-weight misclassified source-domain samples
        for j in range(row_A):
            weights[j] = weights[j] * np.power(
                beta, np.abs(result_label[j, i] - label_A[j]))

        y_pred = result_label[(row_A + row_S):, i]
        fpr_lr_train, tpr_lr_train, _ = roc_curve(label_test, y_pred)
        train_ks = abs(fpr_lr_train - tpr_lr_train).max()
        print('test_ks : ', train_ks, 'round', i + 1)

        # Instead of voting with the second half of the learners (as in the
        # original TrAdaBoost), keep only the single best logistic regression
        if train_ks > best_ks:
            best_ks = train_ks
            best_round = i
            best_model = model
        # Stop once KS has not improved for early_stopping_rounds rounds
        if best_round < i - early_stopping_rounds:
            break
    return best_ks, best_round, best_model

# Train and keep the best model
best_ks, best_round, best_model = Tr_lr_boost(trans_A, trans_S, label_A, label_S,
                                              test, label_test=val_y, N=300,
                                              early_stopping_rounds=20)

y_pred = best_model.predict_proba(trans_S)[:,1]
fpr_lr_train, tpr_lr_train, _ = roc_curve(label_S, y_pred)
train_ks = abs(fpr_lr_train - tpr_lr_train).max()
print('train_ks : ', train_ks)
# train_ks :  0.4629947544110634

y_pred = best_model.predict_proba(test)[:,1]
fpr_lr, tpr_lr, _ = roc_curve(val_y, y_pred)
val_ks = abs(fpr_lr - tpr_lr).max()
print('val_ks : ', val_ks)
# val_ks :  0.39846160021324123

from matplotlib import pyplot as plt
plt.plot(fpr_lr_train, tpr_lr_train, label='train LR')
plt.plot(fpr_lr, tpr_lr, label='evl LR')
plt.plot([0,1], [0,1], 'k--')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC Curve')
plt.legend(loc='best')
plt.show()
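To see why the re-weighting favors the target domain, here is a purely illustrative sketch of one weight update (the numbers are made up, not from the dataset): with a weighted target-domain error of 0.2, beta_t = 0.25, so a misclassified target sample's weight is multiplied by beta_t ** -1 = 4, while a misclassified source sample's weight is multiplied by a fixed beta < 1 and shrinks.

import numpy as np

error_rate = 0.2                        # hypothetical weighted error on the target domain
beta_t = error_rate / (1 - error_rate)  # 0.25

# Fixed source discount in the paper's form, with hypothetical
# n = 1000 source rows and N = 300 rounds
beta = 1 / (1 + np.sqrt(2 * np.log(1000) / 300))

# Both samples misclassified, i.e. |score - label| treated as 1
print(1.0 * beta_t ** -1)  # 4.0, target sample weight grows
print(1.0 * beta ** 1)     # about 0.82, source sample weight shrinks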

As for why these results differ somewhat from the author Mei's results, differences in the data are the most likely cause.
