一步一步詳解ID3和C4.5的C++實現

本文轉載自查看原文 2014-11-26 12:33 4132 遞歸/ C4.5/ ID3/ Data mining/數據挖掘/ 決策樹

1. 關於ID3和C4.5的原理介紹這里不贅述，網上到處都是，可以下載講義c9641_c001.pdf或者參考李航的《統計學習方法》.

2. 數據與數據處理

本文采用下面的訓練數據：

數據處理：本文只采用了"Outlook", "Humidity", "Windy"三個屬性，然后根據Humidity的值是否大於75，將Humidity的值歸為兩類，Play Golf 的值就是類別標簽，只有yes 和 no兩類

訓練集是字符和數字的混合，這會給編程帶來麻煩，所以首先把訓練集用數字表示出來：

 1 const unsigned att_num = 3;  2 const unsigned rule_num = 14;  3 string decision_tree_name("Play Golf ?");  4 string attribute_names[] = {"Outlook", "Humidity", "Windy"};  5 string attribute_values[] = {"Sunny", "Overcast", "Rainy", "> 75", "<= 75", "True", "False", "Yes", "No"};  6 //訓練集最后一列為分類標簽，所以總列數為屬性數加1
 7 unsigned train_data[rule_num][att_num + 1] = {  8                     {0, 3, 6, 8},{0, 3, 5, 8},{1, 3, 6, 7},  9                     {2, 3, 6, 7},{2, 3, 6, 7},{2, 4, 5, 8}, 10                     {1, 4, 5, 7},{0, 3, 6, 8},{0, 4, 6, 7}, 11                     {2, 3, 6, 7},{0, 4, 5, 7},{1, 3, 5, 7}, 12                     {1, 4, 6, 7},{2, 3, 5, 8} 13                                 };

以train_data的第一行{0, 3, 6, 8}為例解釋：前三列值對應的屬性與attribute_names中的元素分別對應，最后一列是類別標簽的值，0 表示 attribute_values的第1個元素，即"Sunny"，類似地，3 便是attribute_values的第4個元素"> 75"，6 是 "False"，8 是"No"，所以{0, 3, 6, 8} 代表的實例就是：Outlook = Sunny，Humidity > 75，Windy = False，Play Golf = No。

　　其他實例都是以這樣的方式數字化，方便編程.

3. 編寫必要函數

因為ID3和C4.5都需要計算屬性的信息增益，C4.5還需要計算屬性的信息增益比，所以正確編寫這兩個函數很重要，對比着講義c9641_c001.pdf或者其他參考資料，編寫出這兩個函數.(代碼最后附上)

4. 確定數據結構

這是最重要的一環，明確目的：構造一個決策樹！這就直接決定了編程的正確或者難易，網上有很多例子，但是我覺得不夠簡潔，這里我采用一種簡單且容易理解的方式：

// A decision-tree node together with the subtree below it.
// A node with no branches is a leaf.
struct Tree {
    // Internal node: index of the attribute tested at this node.
    // Leaf: value code of the class label.
    unsigned root = 0;
    // Value codes the tested attribute takes; one outgoing branch per value.
    std::vector<unsigned> branches;
    // Subtrees, stored in the same order as `branches`.
    std::vector<Tree> children;
};

每一個決策樹都是由根節點開始，然后有很多分支，分支連接着孩子節點，而每一個孩子節點以及這個孩子節點對應的所有子孫又可以組成一棵樹，這是一個不斷遞歸的過程，所以采用了上面的數據結構.

5. 構造決策樹

有了上面的基礎，開始着手構造決策樹,根據規則選出某一屬性作為根節點，根據根節點的取值確定分支，然后構造孩子節點，根據上面的陳述可以知道，每一個孩子節點及其后面的子孫又是一棵樹，所以這是一個遞歸操作，即采用前面同樣的方式來構造這個子樹，以此類推。

6. 打印決策樹

因為樹的結構是遞歸的，所以打印決策樹同樣是一個遞歸的過程。

7. 代碼實現

 1 /*************************************************  2 Copyright:1.0  3 Author:90Zeng  4 Date:2014-11-25  5 Description:ID3/C4.5 algorithm  6 **************************************************/
 7 
 8 #include <iostream>
 9 #include <cmath>
 10 #include <vector>
 11 #include <string>
 12 #include <algorithm>
 13 using namespace std;  14 
 15 
 16 const unsigned att_num = 3;  17 const unsigned rule_num = 14;  18 string decision_tree_name("Play Golf ?");  19 string attribute_names[] = {"Outlook", "Humidity", "Windy"};  20 string attribute_values[] = {"Sunny", "Overcast", "Rainy", "> 75", "<= 75", "True", "False", "Yes", "No"};  21 //訓練集最后一列為分類標簽，所以總列數為屬性數加1
 22 unsigned train_data[rule_num][att_num + 1] = {  23                     {0, 3, 6, 8},{0, 3, 5, 8},{1, 3, 6, 7},  24                     {2, 3, 6, 7},{2, 3, 6, 7},{2, 4, 5, 8},  25                     {1, 4, 5, 7},{0, 3, 6, 8},{0, 4, 6, 7},  26                     {2, 3, 6, 7},{0, 4, 5, 7},{1, 3, 5, 7},  27                     {1, 4, 6, 7},{2, 3, 5, 8}  28  };  29 
 30 
 31 
 32 
 33 /*************************************************  34 Function: unique()  35 Description: 將vector中重復元素合並，只保留一個  36 Calls: 無  37 Input: vector  38 Output: vector  39 *************************************************/
 40 template <typename T>
 41 vector<T> unique(vector<T> vals)  42 {  43     vector<T> unique_vals;  44     vector<T>::iterator itr;  45     vector<T>::iterator subitr;  46 
 47     int flag = 0;  48     while( !vals.empty() )  49  {  50         unique_vals.push_back(vals[0]);  51         itr = vals.begin();  52         subitr = unique_vals.begin() + flag;  53         while ( itr != vals.end())  54  {  55             if (*subitr == *itr)  56                 itr = vals.erase(itr);  57             else
 58                 itr++;  59  }  60         flag++;  61  }  62     return unique_vals;  63 }  64 
 65 /*************************************************  66 Function: log2()  67 Description: 計算一個數值得以2為底的對數  68 Calls: 無  69 Input: double  70 Output: double  71 *************************************************/
 72 
 73 double log2(double n)  74 {  75     return log10(n) / log10(2.0);  76 }  77 
 78 /*************************************************  79 Function: compute_entropy()  80 Description: 根據屬性的取值，計算該屬性的熵  81 Calls: unique(),log2(),count(),其中count()  82  在STL的algorithm庫中  83 Input: vector<unsigned>  84 Output: double  85 *************************************************/
 86 double compute_entropy(vector<unsigned> v)  87 {  88     vector<unsigned> unique_v;  89     unique_v = unique(v);  90 
 91     vector<unsigned>::iterator itr;  92     itr = unique_v.begin();  93 
 94     double entropy = 0.0;  95     auto total = v.size();  96     while(itr != unique_v.end())  97  {  98         double cnt = count(v.begin(), v.end(), *itr);  99         entropy -= cnt / total * log2(cnt / total); 100         itr++; 101  } 102     return entropy; 103 } 104 
105 /************************************************* 106 Function: compute_gain() 107 Description: 計算數據集中所有屬性的信息增益 108 Calls: compute_entropy(),unique() 109 Input: vector<vector<unsigned> > 110  相當於一個二維數組，存儲着訓練數據集 111 Output: vector<double> 存儲着所有屬性的信息 112  增益 113 *************************************************/
114 vector<double> compute_gain(vector<vector<unsigned> > truths) 115 { 116     vector<double> gain(truths[0].size() - 1, 0); 117     vector<unsigned> attribute_vals; 118     vector<unsigned> labels; 119     for(unsigned j = 0; j < truths.size(); j++) 120  { 121  labels.push_back(truths[j].back()); 122  } 123 
124     for(unsigned i = 0; i < truths[0].size() - 1; i++)//最后一列是類別標簽，沒必要計算信息增益
125  { 126         for(unsigned j = 0; j < truths.size(); j++) 127  attribute_vals.push_back(truths[j][i]); 128 
129         vector<unsigned> unique_vals = unique(attribute_vals); 130         vector<unsigned>::iterator itr = unique_vals.begin(); 131         vector<unsigned> subset; 132         while(itr != unique_vals.end()) 133  { 134             for(unsigned k = 0; k < truths.size(); k++) 135  { 136                 if (*itr == attribute_vals[k]) 137  { 138  subset.push_back(truths[k].back()); 139  } 140  } 141             double A = (double)subset.size(); 142             gain[i] += A / truths.size() * compute_entropy(subset); 143             itr++; 144  subset.clear(); 145  } 146         gain[i] = compute_entropy(labels) - gain[i]; 147  attribute_vals.clear(); 148  } 149     return gain; 150 } 151 
152 /************************************************* 153 Function: compute_gain_ratio() 154 Description: 計算數據集中所有屬性的信息增益比 155  C4.5算法中用到 156 Calls: compute_gain();compute_entropy() 157 Input: 訓練數據集 158 Output: 信息增益比 159 *************************************************/
160 vector<double> compute_gain_ratio(vector<vector<unsigned> > truths) 161 { 162     vector<double> gain = compute_gain(truths); 163     vector<double> entropies; 164     vector<double> gain_ratio; 165     
166     for(unsigned i = 0; i < truths[0].size() - 1; i++)//最后一列是類別標簽，沒必要計算信息增益比
167  { 168         vector<unsigned> attribute_vals(truths.size(), 0); 169         for(unsigned j = 0; j < truths.size(); j++) 170  { 171             attribute_vals[j] = truths[j][i]; 172  } 173         double current_entropy = compute_entropy(attribute_vals); 174         if (current_entropy) 175  { 176             gain_ratio.push_back(gain[i] / current_entropy); 177  } 178         else
179             gain_ratio.push_back(0.0); 180         
181  } 182     return gain_ratio; 183 } 184 
/**
 * Finds the class label that occurs most often in a data set.
 *
 * The label of a row is its last element. On a tie, the label whose first
 * occurrence comes earliest in `data` wins.
 *
 * Calls: std::count()
 *
 * @param data  Examples; rows that are empty carry no label and are skipped.
 * @return      The most frequent label, or a value-initialized T when no row
 *              carries a label.
 */
template <typename T>
T find_most_common_label(const std::vector<std::vector<T> >& data)
{
    std::vector<T> labels;
    labels.reserve(data.size());
    for (const std::vector<T>& row : data)
    {
        if (!row.empty())
            labels.push_back(row.back());
    }

    // Value-initialized so that an empty data set yields a defined result.
    T most_common_label = T();
    typename std::vector<T>::difference_type most_counter = 0;
    for (const T& label : labels)
    {
        const typename std::vector<T>::difference_type current_counter =
            std::count(labels.begin(), labels.end(), label);
        // Strict comparison keeps the earliest label on ties.
        if (current_counter > most_counter)
        {
            most_common_label = label;
            most_counter = current_counter;
        }
    }
    return most_common_label;
}
/**
 * Lists the distinct values an attribute takes in a data set.
 *
 * Calls: unique()
 *
 * @param attribute  Column index of the attribute; every row must have a
 *                   column at this index.
 * @param data       Examples to scan.
 * @return           The values found in that column with duplicates removed,
 *                   as returned by unique().
 */
template <typename T>
std::vector<T> find_attribute_values(T attribute, const std::vector<std::vector<T> >& data)
{
    std::vector<T> values;
    values.reserve(data.size());
    for (const std::vector<T>& row : data)
        values.push_back(row[attribute]);
    return unique(values);
}
/**
 * Marks an attribute as already used while the decision tree is being built.
 *
 * The column is not physically removed. Every row's entry for `attribute` is
 * overwritten with the sentinel 110, a number that does not occur as a value
 * code in the training set, so later steps can recognise the column as used.
 *
 * @param attribute  Column index of the attribute to mark; every row must
 *                   have a column at this index.
 * @param data       Examples; taken by value, so the caller's copy is unchanged.
 * @return           A copy of `data` with column `attribute` set to 110 in
 *                   every row. Empty input yields an empty result.
 */
template <typename T>
std::vector<std::vector<T> > drop_one_attribute(T attribute, std::vector<std::vector<T> > data)
{
    for (std::vector<T>& row : data)
        row[attribute] = 110;
    return data;
}
258 
// A decision-tree node together with the subtree below it.
// A node with no branches is a leaf.
struct Tree {
    // Internal node: index of the attribute tested at this node.
    // Leaf: value code of the class label.
    unsigned root = 0;
    // Value codes the tested attribute takes; one outgoing branch per value.
    std::vector<unsigned> branches;
    // Subtrees, stored in the same order as `branches`.
    std::vector<Tree> children;
};
265 /************************************************* 266 Function: build_decision_tree() 267 Description: 遞歸構建決策樹 268                 
269 Calls: unique()，count(), 270  find_most_common_label() 271  compute_gain()(ID3), 272  compute_gain_ratio()(C4.5), 273  find_attribute_values(), 274  drop_one_attribute(), 275  build_decision_tree()(遞歸， 276  當然要調用函數本身) 277 Input: 訓練數據集，一個空決策樹 278 Output: 無 279 *************************************************/
280 void build_decision_tree(vector<vector<unsigned> > examples, Tree &tree) 281 { 282     //第一步：判斷所有實例是否都屬於同一類，如果是，則決策樹是單節點
283     vector<unsigned> labels(examples.size(), 0); 284     for (unsigned i = 0; i < examples.size(); i++) 285  { 286         labels[i] = examples[i].back(); 287  } 288     if (unique(labels).size() == 1) 289  { 290         tree.root = labels[0]; 291         return; 292  } 293 
294     //第二步：判斷是否還有剩余的屬性沒有考慮，如果所有屬性都已經考慮過了， 295     //那么此時屬性數量為0，將訓練集中最多的類別標記作為該節點的類別標記
296     if (count(examples[0].begin(),examples[0].end(),110) == examples[0].size() - 1)//只剩下一列類別標記
297  { 298         tree.root = find_most_common_label(examples); 299         return; 300  } 301     //第三步:在上面兩步的條件都判斷失敗后，計算信息增益，選擇信息增益最大 302     //的屬性作為根節點,並找出該節點的所有取值
303 
304     vector<double> standard = compute_gain(examples); 305 
306     //要是采用C4.5，將上面一行注釋掉，把下面一行的注釋去掉即可 307     //vector<double> standard = compute_gain_ratio(examples);
308     tree.root = 0; 309     for (unsigned i = 0; i < standard.size(); i++) 310  { 311         if (standard[i] >= standard[tree.root] && examples[0][i] != 110) 312             tree.root  = i; 313  } 314 
315 
316     tree.branches = find_attribute_values(tree.root, examples); 317     //第四步:根據節點的取值，將examples分成若干子集
318     vector<vector<unsigned> > new_examples = drop_one_attribute(tree.root, examples); 319     vector<vector<unsigned> > subset; 320     for (unsigned i = 0; i < tree.branches.size(); i++) 321  { 322         for (unsigned j = 0; j < examples.size(); j++) 323  { 324             for (unsigned k = 0; k < examples[0].size(); k++) 325  { 326                 if (tree.branches[i] == examples[j][k]) 327  subset.push_back(new_examples[j]); 328  } 329  } 330         // 第五步:對每一個子集遞歸調用build_decision_tree()函數
331  Tree new_tree; 332  build_decision_tree(subset,new_tree); 333  tree.children.push_back(new_tree); 334  subset.clear(); 335  } 336 } 337 
338 /************************************************* 339 Function: print_tree() 340 Description: 從第根節點開始，逐層將決策樹輸出到終 341  端顯示 342 
343 Calls: print_tree(); 344 Input: 決策樹，層數 345 Output: 無 346 *************************************************/
347 void print_tree(Tree tree,unsigned depth) 348 { 349     for (unsigned d = 0; d < depth; d++) cout << "\t"; 350     if (!tree.branches.empty()) //不是葉子節點
351  { 352         cout << attribute_names[tree.root] << endl; 353         
354         for (unsigned i = 0; i < tree.branches.size(); i++) 355  { 356             for (unsigned d = 0; d < depth + 1; d++) cout << "\t"; 357             cout << attribute_values[tree.branches[i]] << endl; 358             print_tree(tree.children[i],depth + 2); 359  } 360  } 361     else //是葉子節點
362  { 363         cout << attribute_values[tree.root] << endl; 364  } 365         
366 } 367 
368 
369 int main() 370 { 371     vector<vector<unsigned> > rules(rule_num, vector<unsigned>(att_num + 1, 0)); 372     for(unsigned i = 0; i < rule_num; i++) 373  { 374         for(unsigned j = 0; j <= att_num; j++) 375             rules[i][j] = train_data[i][j]; 376  } 377  Tree tree; 378  build_decision_tree(rules, tree); 379     cout << decision_tree_name << endl; 380     print_tree(tree,0); 381     return 0; 382 }

8.運行結果：

前者是采用ID3運行的結果，后者是講義c9641_c001.pdf給出的構造的決策樹，二者一致，驗證了程序的正確性.

9.總結

所謂"百鳥在林，不如一鳥在手"，ID3和C4.5的思想都很簡單，容易理解，但是在實現的過程中由於數據結構的確定和遞歸調用等問題，還是調試了很久，收獲很多，實踐出真知！

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯系本站郵箱yoyou2525@163.com刪除。

猜您在找 C4.5和ID3的差別 ID3和C4.5的理論和應用 C#一步一步實現插件框架的示例(四) c#一步一步實現ORM C#一步一步實現插件框架的示例(一) C#一步一步實現插件框架的示例(二) C#一步一步實現插件框架的示例(三) 決策樹(ID3、C4.5、CART) ID3、C4.5、CART、RandomForest的原理決策樹算法原理(ID3，C4.5)