使用遺傳算法進行調參【決策樹】


背景

最近接到一個項目,使用遺傳算法對決策樹進行調參;以前都是使用網格搜索來調參,沒想到也可以用ga來做這件事情,再加上以前也寫過比較多的ga算法,也就接了下來,本來以為要花一點時間來搞,實際上熟悉的話2-3個小時就能搞定。

算法

做項目肯定是要用庫的啦(不可能自己寫的),選擇使用sklearn的決策樹,ga算法流程比較清晰,就自己手寫了,下面關鍵介紹ga算法的幾個步驟是如何做的。

初始化

選擇決策樹比較重要的三個參數"max_depth", "min_samples_split", "max_leaf_nodes",窮舉這三個參數可能的值進行初始化

def init():
    """Build the initial GA population by enumerating a coarse grid of
    (max_depth, min_samples_split, max_leaf_nodes) values."""
    return [
        make_tree([depth, split, leaves])
        for depth in range(5, 31, 3)
        for split in range(5, 25, 5)
        for leaves in range(5, 25, 5)
    ]

選擇

使用準確率作為評分依據,計算得到累計選擇概率

def tree_score(X, Y, clf):
    """Return the mean 5-fold cross-validation accuracy of `clf` on (X, Y)."""
    folds = KFold(n_splits=5)
    accuracies = []
    for fit_idx, hold_idx in folds.split(X):
        clf.fit(X[fit_idx], Y[fit_idx])
        predictions = clf.predict(X[hold_idx])
        accuracies.append(accuracy_score(y_true=Y[hold_idx], y_pred=predictions))
    return np.mean(accuracies)
 1 def adaption(X, Y, forest):
 2     score = []
 3     for t in forest:
 4         score.append(tree_score(X, Y, t))
 5     best_pos = np.argmax(score)
 6     global BEST_TREE
 7     BEST_TREE = copy.deepcopy(forest[best_pos])
 8     sm = np.sum(score)
 9     ada = score / sm
10     for i in range(1, len(ada)):
11         ada[i] = ada[i] + ada[i - 1]
12     return ada

選擇這里可以注意一下,可以使用精英策略,即:把當前這一輪最好的個體,直接送入下一代中。這個策略在提升算法的穩定性上有很大用處

交叉

交叉使用的是參數的交叉,比如clf1和clf2,隨機找到一個交換參數的位置p,進行交叉

 1 def _cross_2_tree(t1, t2):
 2     sz = len(param)
 3 
 4     t1_param_value = _dict_get_value_list(t1.__dict__, param)
 5     t2_param_value = _dict_get_value_list(t2.__dict__, param)
 6     pos = random.randint(0, sz - 1)
 7     t1_left = t1_param_value[0:pos + 1]
 8     t1_right = t1_param_value[pos + 1:]
 9 
10     t2_left = t2_param_value[0:pos + 1]
11     t2_right = t2_param_value[pos + 1:]
12 
13     t1_left.extend(t2_right)
14     t2_left.extend(t1_right)
15     return [make_tree(t1_left), make_tree(t2_left)]
16 
17 
def cross(forest):
    """Pairwise single-point crossover over the whole population.

    Parents are consumed two at a time in order. With an odd population size
    the unpaired last individual is carried over unchanged; the previous code
    dropped it, silently shrinking the population.
    """
    result = []
    sz = len(forest)
    for i in range(1, sz, 2):
        result.extend(_cross_2_tree(forest[i - 1], forest[i]))
    if sz % 2 == 1:
        result.append(forest[-1])
    return result

變異

這一步使用比較簡單的策略,直接在參數上進行+1或者-1操作

 1 def variation(forest):
 2     result = []
 3     for t in forest:
 4         r = random.random()
 5         if r < VAR_P:
 6             result.append(t)
 7             continue
 8 
 9         # 變異
10         sz = len(param)
11         pos = random.randint(0, sz - 1)
12         val = t.__dict__[param[pos]]
13         up = random.random()
14 
15         if up > 0.5:
16             val = val + 1
17         else:
18             val = val - 1
19 
20         if val < 2:
21             val = 2
22         t.__dict__[param[pos]] = val
23         result.append(t)
24     return result

 

完整代碼

  1 import pandas as pd
  2 import numpy as np
  3 from sklearn.tree import DecisionTreeClassifier
  4 from sklearn.model_selection import train_test_split
  5 from sklearn.model_selection import KFold
  6 from sklearn.metrics import accuracy_score
  7 import random
  8 import copy
  9 import matplotlib.pyplot as plt
 10 
# Names of the DecisionTreeClassifier hyperparameters tuned by the GA;
# chromosome values follow this exact order.
param = ["max_depth", "min_samples_split", "max_leaf_nodes"]
# Number of GA generations to run.
epochs = 300
# Threshold drawn against random() in variation(); an individual mutates
# when the draw is >= VAR_P.
VAR_P = 0.4
# Elite individual recorded by adaption() and re-injected by evulate_forest().
BEST_TREE = None
 15 
 16 
 17 def make_tree(param_value):
 18     p = dict(zip(param, param_value))
 19     return DecisionTreeClassifier(**p)
 20 
 21 
 22 def init():
 23     forest = []
 24     for max_depth in range(5, 31, 3):
 25         for min_samples_split in range(5, 25, 5):
 26             for max_leaf_nodes in range(5, 25, 5):
 27                 forest.append(make_tree([max_depth, min_samples_split, max_leaf_nodes]))
 28     return forest
 29 
 30 def tree_score(X, Y, clf):
 31     kf = KFold(n_splits=5)
 32     score = []
 33     for train_index, valid_index in kf.split(X):
 34         clf.fit(X[train_index], Y[train_index])
 35         pred = clf.predict(X[valid_index])
 36         score.append(accuracy_score(y_true=Y[valid_index], y_pred=pred))
 37     return np.mean(score)
 38 
 39 
 40 def evulate_forest(X, Y, forest):
 41     score = []
 42     for t in forest:
 43         score.append(tree_score(X, Y, t))
 44     worse_pos = np.argmin(score)
 45     global BEST_TREE
 46     forest[worse_pos] = BEST_TREE
 47     score[worse_pos] = tree_score(X, Y, BEST_TREE)
 48 
 49     score.sort(reverse=True)
 50     return score, np.mean(score)
 51 
 52 
 53 def adaption(X, Y, forest):
 54     score = []
 55     for t in forest:
 56         score.append(tree_score(X, Y, t))
 57     best_pos = np.argmax(score)
 58     global BEST_TREE
 59     BEST_TREE = copy.deepcopy(forest[best_pos])
 60     sm = np.sum(score)
 61     ada = score / sm
 62     for i in range(1, len(ada)):
 63         ada[i] = ada[i] + ada[i - 1]
 64     return ada
 65 
 66 
 67 def choose_trees(forest, ada):
 68     sz = len(forest)
 69     result = []
 70     for i in range(sz):
 71         r = random.random()
 72         for j in range(len(ada)):
 73             if r <= ada[j]:
 74                 result.append(copy.deepcopy(forest[j]))
 75                 break
 76     return result
 77 
 78 
 79 def _dict_get_value_list(mp, key_list):
 80     value_list = []
 81     for key in key_list:
 82         value_list.append(mp.get(key))
 83     return value_list
 84 
 85 
 86 def _cross_2_tree(t1, t2):
 87     sz = len(param)
 88 
 89     t1_param_value = _dict_get_value_list(t1.__dict__, param)
 90     t2_param_value = _dict_get_value_list(t2.__dict__, param)
 91     pos = random.randint(0, sz - 1)
 92     t1_left = t1_param_value[0:pos + 1]
 93     t1_right = t1_param_value[pos + 1:]
 94 
 95     t2_left = t2_param_value[0:pos + 1]
 96     t2_right = t2_param_value[pos + 1:]
 97 
 98     t1_left.extend(t2_right)
 99     t2_left.extend(t1_right)
100     return [make_tree(t1_left), make_tree(t2_left)]
101 
102 
def cross(forest):
    """Pairwise single-point crossover over the whole population.

    Parents are consumed two at a time in order. With an odd population size
    the unpaired last individual is carried over unchanged; the previous code
    dropped it, silently shrinking the population.
    """
    result = []
    sz = len(forest)
    for i in range(1, sz, 2):
        result.extend(_cross_2_tree(forest[i - 1], forest[i]))
    if sz % 2 == 1:
        result.append(forest[-1])
    return result
109 
110 
def variation(forest):
    """Mutate individuals in place by nudging one tuned hyperparameter by ±1.

    NOTE(review): an individual mutates when random() >= VAR_P, i.e. with
    probability 1 - VAR_P (0.6 for VAR_P = 0.4) — confirm this inversion of
    the usual "mutation rate" reading is intended.
    """
    mutated = []
    for tree in forest:
        if random.random() < VAR_P:
            # Survives this generation unchanged.
            mutated.append(tree)
            continue

        # Pick one hyperparameter at random and step it up or down by 1.
        name = param[random.randint(0, len(param) - 1)]
        value = tree.__dict__[name]
        value = value + 1 if random.random() > 0.5 else value - 1

        # Clamp at 2 so sklearn's min_samples_split / max_leaf_nodes
        # constraints are never violated.
        tree.__dict__[name] = max(value, 2)
        mutated.append(tree)
    return mutated
135 
136 
# --- GA driver script ---
# Load the dataset: first column is the label, the remaining columns are
# features. index_col=0 drops the CSV's row index.
df = pd.read_csv("../dataset/data.csv", index_col=0)
X = df.iloc[:, 1:].values
Y = df.iloc[:, 0].values
forest = init()

# Mean population accuracy per generation, for the final plot.
mean_score_arr = []

for i in range(epochs):
    # Selection: fitness -> cumulative probabilities, then roulette-wheel draw.
    ada = adaption(X, Y, forest)
    forest = choose_trees(forest, ada)
    # Crossover and mutation produce the next generation.
    forest = cross(forest)
    forest = variation(forest)
    # Elitism + bookkeeping: the worst individual is replaced by BEST_TREE.
    score, mean = evulate_forest(X, Y, forest)
    mean_score_arr.append(mean)

    print(i, "/", epochs, ":")
    print("mean:", mean)

# Plot mean population accuracy across generations.
plt.plot(np.arange(len(mean_score_arr)), mean_score_arr)
plt.show()

總結

感覺使用ga進行調參很雞肋,還不如使用網格搜索來的快,但是作為一種思想可以學習一下的。

最近搞了一個人工智能交流的群:831852635,有興趣的可以加一下!

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM