xgboost的遺傳算法調參

本文轉載自查看原文 2018-09-25 17:11 770 遺傳算法/ 參數優化/ xgboost/ 接口開發

遺傳算法適應度的選擇：

機器學習的適應度可以是任何性能指標——準確度、精確度、召回率、F1分數等等。根據適應度值，我們選擇表現最佳的父母（“適者生存”），作為幸存的種群。

交配：

存活下來的群體中的父母將通過交配產生後代，交配由兩個步驟組合而成：交叉（重組）和突變。

交叉：交配父母的基因（參數）將被重新組合，產生後代，每個孩子從父母雙方遺傳一些基因（參數）。

突變：一些基因(參數)的值將被改變以保持遺傳多樣性，這使得遺傳算法通常能夠得到更好的解決方案。

備註：我們保留幸存的父母，以便保留適應度最好的參數，以防後代的適應度值比父母差。

xgboost超參數搜索遺傳算法模塊：

模塊將具有遵循以下四個步驟的功能：初始化種群、選擇、交叉、變異（即上文的突變）。

import numpy as np
import random 
from sklearn.metrics import f1_score
import xgboost 

class GeneticXgboost:
    """Genetic-algorithm search over seven XGBoost hyper-parameters.

    An individual is one row of a float array whose columns are, in order:
    learning_rate, n_estimators, max_depth, min_child_weight, gamma,
    subsample, colsample_bytree.
    """

    # Number of genes (columns) described by the tables below.
    _NUM_GENES = 7

    # (min, max) clamp applied to a gene after it has been mutated.
    _GENE_BOUNDS = (
        (0.01, 1.0),    # learning_rate
        (10, 2000),     # n_estimators
        (1, 15),        # max_depth
        (0, 10.0),      # min_child_weight
        (0.01, 10.0),   # gamma
        (0.01, 1.0),    # subsample
        (0.01, 1.0),    # colsample_bytree
    )

    # (low, high, is_integer) range the mutation step is drawn from.
    # Integer steps use np.random.randint, so `high` is exclusive for them.
    _MUTATION_STEPS = (
        (-0.5, 0.5, False),   # learning_rate
        (-200, 200, True),    # n_estimators
        (-5, 5, True),        # max_depth
        (-5.0, 5.0, False),   # min_child_weight
        (-2.0, 2.0, False),   # gamma
        (-0.5, 0.5, False),   # subsample
        (-0.5, 0.5, False),   # colsample_bytree
    )

    def __init__(self, num_parents=None):
        """
        param num_parents: number of individuals in the population
        """
        self.num_parents = num_parents

    def initilialize_poplulation(self):
        """
        Create the initial population by drawing every gene at random.

        return: float array, shape=[self.num_parents, 7]; an empty (0, 7)
                array when num_parents is 0
        """
        # A single float array holds every gene. The previous per-column
        # uint8 buffers could not represent n_estimators values above 255.
        population = np.empty((self.num_parents, self._NUM_GENES))
        for individual in population:
            individual[0] = round(np.random.uniform(0.01, 1), 2)   # learning_rate
            individual[1] = random.randrange(10, 1500, 25)         # n_estimators
            individual[2] = random.randrange(1, 10)                # max_depth
            individual[3] = round(random.uniform(0.01, 10.0), 2)   # min_child_weight
            individual[4] = round(random.uniform(0.01, 10.0), 2)   # gamma
            individual[5] = round(random.uniform(0.01, 1.0), 2)    # subsample
            individual[6] = round(random.uniform(0.01, 1.0), 2)    # colsample_bytree
        return population

    def fitness_function(self, y_true, y_pred):
        """
        Fitness of one individual: weighted F1 score rounded to 4 decimals.
        """
        return round(f1_score(y_true, y_pred, average='weighted'), 4)

    def fitness_compute(self, population, dMatrixTrain, dMatrixtest, y_test):
        """
        Train one booster per individual and score its predictions.

        param population:   population array, one individual per row
        param dMatrixTrain: data passed to xgboost.train
        param dMatrixtest:  data passed to the trained booster's predict
        param y_test:       true labels compared against the predictions
        return: list with the fitness value of every individual, in row order
        """
        scores = []
        for individual in population:
            params = {'objective':        'binary:logistic',
                      'learning_rate':    individual[0],
                      'max_depth':        int(individual[2]),
                      'min_child_weight': individual[3],
                      'gamma':            individual[4],
                      'subsample':        individual[5],
                      'colsample_bytree': individual[6],
                      'seed': 24}
            # xgboost.train takes the number of trees as num_boost_round;
            # an 'n_estimators' entry in the params dict is not used by it,
            # which previously left this gene without any effect on fitness.
            booster = xgboost.train(params, dMatrixTrain,
                                    num_boost_round=int(individual[1]))
            # Threshold the predicted probabilities at 0.5 to get class labels.
            predictions = booster.predict(dMatrixtest) > 0.5
            scores.append(self.fitness_function(y_test, predictions))
        return scores

    def parents_selection(self, population, fitness, num_store):
        """
        Keep the individuals with the highest fitness.

        param population: population, shape=[num_individuals, num_gene]
        param fitness:    fitness value of every individual (list or array);
                          it is not modified
        param num_store:  number of individuals to keep
        return: the kept individuals, best first, shape=[num_store, num_gene]
        raises ValueError: if num_store exceeds the number of fitness values
        """
        scores = np.asarray(fitness, dtype=float)
        if num_store > scores.shape[0]:
            raise ValueError("num_store cannot exceed the number of individuals")
        # A stable sort on the negated scores ranks best-first and resolves
        # ties in favour of the lower row index.
        best_first = np.argsort(-scores, kind='stable')[:num_store]
        return np.asarray(population, dtype=float)[best_first]

    def crossover_uniform(self, parents, childrenSize):
        """
        Uniform crossover.

        One random set of gene positions is drawn per call. Child i takes
        those positions from parent i and the remaining positions from
        parent i+1 (indices wrap around the parent array).

        param parents:      selected parents, shape=[num_parents, num_gene]
        param childrenSize: (number of children, number of genes)
        return: children, float array of shape childrenSize
        raises ValueError: if children are requested but parents is empty
        """
        num_children, num_genes = int(childrenSize[0]), int(childrenSize[1])
        parents = np.asarray(parents, dtype=float)
        if num_children and parents.shape[0] == 0:
            raise ValueError("cannot create children without parents")

        # Positions inherited from the first parent. The draw is with
        # replacement, so between 1 and num_genes // 2 positions end up set.
        from_first = np.zeros(num_genes, dtype=bool)
        if num_genes:
            from_first[np.random.randint(0, num_genes, num_genes // 2)] = True

        if num_children == 0:
            return np.empty((0, num_genes))

        child_ids = np.arange(num_children)
        first = parents[child_ids % parents.shape[0], :num_genes]
        second = parents[(child_ids + 1) % parents.shape[0], :num_genes]
        return np.where(from_first, first, second)

    def mutation(self, crossover, num_param):
        '''
        Mutation.

        Picks one gene at random, draws one random step for it and adds that
        step to the gene in every row, clamping to the gene's allowed range.

        param crossover: 2-D array of children to mutate (modified in place)
        param num_param: number of parameters per individual
        return: the same array, after mutation
        '''
        # Only the genes covered by the built-in tables can be mutated.
        num_mutable = min(int(num_param), self._NUM_GENES)
        gene = int(np.random.randint(0, num_mutable))

        low, high, is_integer = self._MUTATION_STEPS[gene]
        if is_integer:
            step = int(np.random.randint(low, high))
        else:
            step = round(np.random.uniform(low, high), 2)

        lower, upper = self._GENE_BOUNDS[gene]
        crossover[:, gene] = np.clip(crossover[:, gene] + step, lower, upper)
        return crossover


######################參數收縮測試##############################################
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Driver: run the genetic search on the scikit-learn breast-cancer dataset.
X, y = load_breast_cancer(return_X_y=True)

# Hold out 30% of the samples; the split is fixed by random_state.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

# Fit the scaler on the training split only and reuse it for the held-out split.
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

xgDMatrixTrain = xgboost.DMatrix(X_train, y_train)
xgbDMatrixTest = xgboost.DMatrix(X_test, y_test)


number_of_parents = 8         # size of the population
number_of_generations = 4     # number of generations, i.e. iterations
number_of_parameters = 7      # number of hyper-parameters being optimised
number_of_parents_mating = 4  # individuals kept from each generation

gx = GeneticXgboost(num_parents=number_of_parents)

# Shape of the population array
populationSize = (number_of_parents, number_of_parameters)

# Initial population
population = gx.initilialize_poplulation()
# One row of fitness values per generation, plus one for the final population
FitnessHistory = np.empty([number_of_generations + 1, number_of_parents])
# Parameter values of every individual of every generation, stacked row-wise
populationHistory = np.empty([(number_of_generations + 1) * number_of_parents,
                              number_of_parameters])
# The first block of rows holds the initial population
populationHistory[0:number_of_parents, :] = population

# Evolution loop
# NOTE(review): fitness is measured on the same held-out split that is used
# to report the final score, so the reported score may be optimistic.
for generation in range(number_of_generations):
    print("This is number %s generation" % (generation))
    # Train one model per individual and obtain its fitness
    FitnessValue = gx.fitness_compute(population=population,
                                      dMatrixTrain=xgDMatrixTrain,
                                      dMatrixtest=xgbDMatrixTest,
                                      y_test=y_test)

    FitnessHistory[generation, :] = FitnessValue
    print('Best F1 score in the iteration = {}'.format(np.max(FitnessHistory[generation, :])))
    # Parents that survive into the next generation
    parents = gx.parents_selection(population=population,
                                   fitness=FitnessValue,
                                   num_store=number_of_parents_mating)
    # Children produced by crossover fill the remaining population slots
    children = gx.crossover_uniform(parents=parents,
                                    childrenSize=(populationSize[0] - parents.shape[0],
                                                  number_of_parameters))

    # Mutate the children to keep genetic diversity
    children_mutated = gx.mutation(children, number_of_parameters)

    # New population: surviving parents first, then the mutated children
    population[0:parents.shape[0], :] = parents
    population[parents.shape[0]:, :] = children_mutated
    history_start = (generation + 1) * number_of_parents
    populationHistory[history_start:history_start + number_of_parents, :] = population


# Evaluate the population produced by the last generation
fitness = gx.fitness_compute(population=population,
                             dMatrixTrain=xgDMatrixTrain,
                             dMatrixtest=xgbDMatrixTest,
                             y_test=y_test)
# Fill the last history row, which the loop above never writes; without this
# it would keep the uninitialised contents left by np.empty.
FitnessHistory[number_of_generations, :] = fitness

# Index of the first individual that reaches the maximum fitness
bestFitnessIndex = int(np.argmax(fitness))
print("Best fitness is =", fitness[bestFitnessIndex])

print("Best parameters are:")
print('learning_rate=',        population[bestFitnessIndex][0])
print('n_estimators=',         population[bestFitnessIndex][1])
print('max_depth=',            int(population[bestFitnessIndex][2]))
print('min_child_weight=',     population[bestFitnessIndex][3])
print('gamma=',                population[bestFitnessIndex][4])
print('subsample=',            population[bestFitnessIndex][5])
print('colsample_bytree=',     population[bestFitnessIndex][6])

轉載：https://www.toutiao.com/i6602143792273293837/

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯系本站郵箱yoyou2525@163.com刪除。

猜您在找 遺傳算法調參使用遺傳算法進行調參【決策樹】遺傳算法詳解遺傳算法 TSP 遺傳算法遺傳算法（GA）遺傳算法優化遺傳算法簡介遺傳算法詳解（一）遺傳算法簡介