遺傳算法適應度的選擇:
機器學習的適應度可以是任何性能指標——準確度,精確度,召回率,F1分數等等。根據適應度值,我們選擇表現最佳的父母(“適者生存”),作為幸存的種群。
交配:
存活下來的群體中的父母將通過交配產生後代,使用兩個步驟的組合:交叉/重組和突變。
交叉:交配父母的基因(參數)將被重新組合,產生後代,每個孩子從父母雙方遺傳一些基因(參數);
突變:一些基因(參數)的值將被改變以保持遺傳多樣性,這使得遺傳算法通常能夠得到更好的解決方案。
備注:我們保留幸存的父母,以便保留最好的適應度參數,以防後代的適應度值比父母差。
xgboost超參數搜索遺傳算法模塊:
模塊將具有遵循以下四個步驟的功能:初始化種群,選擇,交叉,變異
import random

import numpy as np

# NOTE: xgboost and scikit-learn are imported inside the functions that use
# them, so the pure-NumPy genetic operators (initialisation, selection,
# crossover, mutation) can be imported and exercised without those packages.


class GeneticXgboost:
    """Genetic-algorithm search over seven XGBoost hyperparameters.

    An individual is one row of seven genes, always in this column order:
    learning_rate, n_estimators, max_depth, min_child_weight, gamma,
    subsample, colsample_bytree.
    """

    # (min, max) value each gene is clamped to after mutation, in gene order.
    GENE_BOUNDS = np.array([
        [0.01, 1.0],    # learning_rate
        [10, 2000],     # n_estimators
        [1, 15],        # max_depth
        [0, 10.0],      # min_child_weight
        [0.01, 10.0],   # gamma
        [0.01, 1.0],    # subsample
        [0.01, 1.0],    # colsample_bytree
    ])
    # Half-width of the symmetric interval the mutation step is drawn from.
    MUTATION_SPAN = (0.5, 200, 5, 5, 2, 0.5, 0.5)
    # Genes that must stay whole numbers (n_estimators, max_depth).
    INTEGER_GENES = (1, 2)

    def __init__(self, num_parents=None):
        """
        param num_parents: number of individuals in the population
        """
        self.num_parents = num_parents

    def initilialize_poplulation(self):
        """Create the initial population with randomly drawn genes.

        return: float array, shape=[self.num_parents, 7]
        """
        # A single float64 array holds every gene. The previous uint8 buffers
        # could not represent n_estimators values above 255.
        population = np.empty((self.num_parents, 7))
        for individual in population:
            individual[0] = round(np.random.uniform(0.01, 1), 2)   # learning_rate
            individual[1] = random.randrange(10, 1500, 25)         # n_estimators
            individual[2] = random.randrange(1, 10)                # max_depth
            individual[3] = round(random.uniform(0.01, 10.0), 2)   # min_child_weight
            individual[4] = round(random.uniform(0.01, 10.0), 2)   # gamma
            individual[5] = round(random.uniform(0.01, 1.0), 2)    # subsample
            individual[6] = round(random.uniform(0.01, 1.0), 2)    # colsample_bytree
        return population

    # Correctly spelled alias; the original name is kept for existing callers.
    initialize_population = initilialize_poplulation

    def fitness_function(self, y_true, y_pred):
        """Return the weighted F1 score of the predictions, rounded to 4 places."""
        from sklearn.metrics import f1_score

        return round(f1_score(y_true, y_pred, average='weighted'), 4)

    def fitness_compute(self, population, dMatrixTrain, dMatrixtest, y_test):
        """Train one booster per individual and score it.

        param population: array, shape=[num_individuals, 7]
        param dMatrixTrain: xgboost.DMatrix used for training
        param dMatrixtest: xgboost.DMatrix used for prediction
        param y_test: true labels matching dMatrixtest
        return: list with one fitness value per individual
        """
        import xgboost

        scores = []
        for genes in population:
            params = {
                'objective': 'binary:logistic',
                'learning_rate': genes[0],
                'max_depth': int(genes[2]),
                'min_child_weight': genes[3],
                'gamma': genes[4],
                'subsample': genes[5],
                'colsample_bytree': genes[6],
                'seed': 24,
            }
            # The native xgboost.train API takes the number of trees as
            # num_boost_round; an 'n_estimators' key in params is ignored,
            # which previously left that gene without any effect.
            booster = xgboost.train(params, dMatrixTrain,
                                    num_boost_round=int(genes[1]))
            predicted = (booster.predict(dMatrixtest) > 0.5).astype(int)
            scores.append(self.fitness_function(y_test, predicted))
        return scores

    def parents_selection(self, population, fitness, num_store):
        """Keep the num_store individuals with the highest fitness.

        param population: array, shape=[num_individuals, num_gene]
        param fitness: sequence of fitness values, one per individual
                       (left unmodified)
        param num_store: number of individuals to keep
        return: array, shape=[num_store, num_gene], best individual first
        """
        # Work on a copy so the caller's fitness values are not overwritten.
        remaining = np.array(fitness, dtype=float)
        selected = np.empty((num_store, population.shape[1]))
        for slot in range(num_store):
            best = int(np.argmax(remaining))  # first index wins on ties
            selected[slot, :] = population[best, :]
            remaining[best] = -1  # exclude it from the following picks
        return selected

    def crossover_uniform(self, parents, childrenSize):
        """Produce children by mixing the genes of consecutive parent pairs.

        One random gene mask is drawn per call. Child i takes the masked genes
        from parent i and the remaining genes from parent i+1, wrapping around
        the parent array.

        param parents: array, shape=[num_parents, num_gene]
        param childrenSize: (num_children, num_gene)
        return: array, shape=childrenSize
        """
        num_children, num_genes = int(childrenSize[0]), int(childrenSize[1])
        num_parents = parents.shape[0]
        if num_parents == 0:
            raise ValueError("crossover needs at least one parent")

        # Genes inherited from the first parent of each pair. Duplicate draws
        # are possible, so fewer than num_genes // 2 genes may be selected.
        from_first = np.zeros(num_genes, dtype=bool)
        from_first[np.random.randint(0, num_genes, num_genes // 2)] = True

        first = np.arange(num_children) % num_parents
        second = (first + 1) % num_parents
        return np.where(from_first, parents[first], parents[second]).astype(float)

    def mutation(self, crossover, num_param):
        """Shift one randomly chosen gene of every child by one random step.

        The same gene and the same step are applied to all rows; results are
        clamped to GENE_BOUNDS. The input array is modified in place and also
        returned.

        param crossover: array of children, shape=[num_children, num_gene]
        param num_param: number of genes per individual
        return: the mutated array
        """
        gene = int(np.random.randint(0, min(num_param, len(self.GENE_BOUNDS))))
        span = self.MUTATION_SPAN[gene]
        if gene in self.INTEGER_GENES:
            step = int(np.random.randint(-span, span))
        else:
            # Symmetric around zero for every gene; min_child_weight used to
            # draw from uniform(5, 5), i.e. always +5.
            step = round(np.random.uniform(-span, span), 2)

        low, high = self.GENE_BOUNDS[gene]
        crossover[:, gene] = np.clip(crossover[:, gene] + step, low, high)
        return crossover


def main():
    """Hyperparameter search demo on the scikit-learn breast cancer dataset."""
    import xgboost
    from sklearn.datasets import load_breast_cancer
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler

    X, y = load_breast_cancer(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=1)
    ss = StandardScaler()
    X_train = ss.fit_transform(X_train)
    X_test = ss.transform(X_test)
    xgDMatrixTrain = xgboost.DMatrix(X_train, y_train)
    xgbDMatrixTest = xgboost.DMatrix(X_test, y_test)

    number_of_parents = 8          # population size
    number_of_generations = 4      # number of iterations
    number_of_parameters = 7       # number of genes being optimised
    number_of_parents_mating = 4   # individuals kept each generation

    gx = GeneticXgboost(num_parents=number_of_parents)
    population = gx.initilialize_poplulation()

    # Fitness of every individual per generation; the last row holds the
    # fitness of the final population.
    FitnessHistory = np.empty([number_of_generations + 1, number_of_parents])
    # Genes of every individual per generation, stacked generation by generation.
    populationHistory = np.empty(
        [(number_of_generations + 1) * number_of_parents, number_of_parameters])
    populationHistory[0:number_of_parents, :] = population

    # NOTE(review): fitness is measured on the same test split that is used to
    # report the final score, so the reported score is optimistic.
    for generation in range(number_of_generations):
        print("This is number %s generation" % (generation))

        FitnessValue = gx.fitness_compute(population=population,
                                          dMatrixTrain=xgDMatrixTrain,
                                          dMatrixtest=xgbDMatrixTest,
                                          y_test=y_test)
        FitnessHistory[generation, :] = FitnessValue
        print('Best F1 score in the iteration = {}'.format(
            np.max(FitnessHistory[generation, :])))

        # Survivors, their children, and the mutated children.
        parents = gx.parents_selection(population=population,
                                       fitness=FitnessValue,
                                       num_store=number_of_parents_mating)
        children = gx.crossover_uniform(
            parents=parents,
            childrenSize=(number_of_parents - parents.shape[0],
                          number_of_parameters))
        children_mutated = gx.mutation(children, number_of_parameters)

        # Survivors are kept in case every child turns out worse than them.
        population[0:parents.shape[0], :] = parents
        population[parents.shape[0]:, :] = children_mutated
        start = (generation + 1) * number_of_parents
        populationHistory[start:start + number_of_parents, :] = population

    # Score the final population and report its best individual.
    fitness = gx.fitness_compute(population=population,
                                 dMatrixTrain=xgDMatrixTrain,
                                 dMatrixtest=xgbDMatrixTest,
                                 y_test=y_test)
    FitnessHistory[number_of_generations, :] = fitness
    bestFitnessIndex = int(np.argmax(fitness))
    print("Best fitness is =", fitness[bestFitnessIndex])
    print("Best parameters are:")
    print('learning_rate=', population[bestFitnessIndex][0])
    print('n_estimators=', population[bestFitnessIndex][1])
    print('max_depth=', int(population[bestFitnessIndex][2]))
    print('min_child_weight=', population[bestFitnessIndex][3])
    print('gamma=', population[bestFitnessIndex][4])
    print('subsample=', population[bestFitnessIndex][5])
    print('colsample_bytree=', population[bestFitnessIndex][6])
    return FitnessHistory, populationHistory


if __name__ == "__main__":
    main()
轉載:https://www.toutiao.com/i6602143792273293837/