The basic concepts below come from screenshots of a Baidu Cloud Classroom (百度雲課堂) course. If they are unfamiliar, read this blog post first: https://blog.csdn.net/weixin_30708329/article/details/97262409




In bagging, each model is trained on a different (resampled) dataset, and the final prediction is obtained by voting across the models.
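Just to make that concrete, here is a minimal, self-contained sketch of the idea (not from the course; the base model, ensemble size, and seed are arbitrary choices):

import numpy as np
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier

iris = datasets.load_iris()
X, y = iris.data, iris.target
rng = np.random.default_rng(0)

# Each model is trained on a different bootstrap resample of the data
models = []
for _ in range(10):
    idx = rng.integers(0, len(X), size=len(X))
    models.append(DecisionTreeClassifier(max_depth=1).fit(X[idx], y[idx]))

# Final prediction: majority vote across the models
votes = np.stack([m.predict(X) for m in models])   # shape (10, n_samples)
pred = np.apply_along_axis(lambda c: np.bincount(c).argmax(), 0, votes)
print("vote accuracy:", (pred == y).mean())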

A random forest is many decision trees combined into a forest. The randomness in the RF algorithm shows up mainly in two places: each tree is trained on a different sample of the data, and each tree chooses its splits from a different subset of candidate features.
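A rough sketch of those two sources of randomness (a simplification: here each tree keeps one fixed feature subset, whereas sklearn's RandomForestClassifier draws a fresh subset at every split):

import numpy as np
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier

iris = datasets.load_iris()
X, y = iris.data, iris.target
rng = np.random.default_rng(1)

forest = []
for _ in range(5):
    rows = rng.integers(0, len(X), size=len(X))            # randomness 1: bootstrap sample
    feats = rng.choice(X.shape[1], size=2, replace=False)  # randomness 2: random features
    forest.append((DecisionTreeClassifier().fit(X[rows][:, feats], y[rows]), feats))

# Vote across trees, each looking only at its own feature subset
votes = np.stack([t.predict(X[:, feats]) for t, feats in forest])
pred = np.apply_along_axis(lambda c: np.bincount(c).argmax(), 0, votes)
print("forest accuracy:", (pred == y).mean())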



In the AdaBoost algorithm the models are not independent: each model influences the next, because samples misclassified by earlier models are given more weight.
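A sketch of that interaction using the standard discrete AdaBoost weight update (binary labels in {-1, +1}; the toy data and parameters are made up for illustration, and sklearn's internals differ in detail):

import numpy as np
from sklearn.tree import DecisionTreeClassifier

rng = np.random.default_rng(2)
X = rng.normal(size=(200, 2))
y = np.where(X[:, 0] * X[:, 1] > 0, 1, -1)         # a toy non-linear target

w = np.full(len(X), 1.0 / len(X))                  # uniform sample weights to start
ensemble = []
for _ in range(10):
    stump = DecisionTreeClassifier(max_depth=1).fit(X, y, sample_weight=w)
    pred = stump.predict(X)
    err = w[pred != y].sum()                       # weighted error of this model
    alpha = 0.5 * np.log((1 - err) / (err + 1e-10))
    # The key step: boost the weights of misclassified samples so the
    # NEXT model is pushed to focus on them
    w *= np.exp(-alpha * y * pred)
    w /= w.sum()
    ensemble.append((alpha, stump))

final = np.sign(sum(a * s.predict(X) for a, s in ensemble))
print("adaboost sketch accuracy:", (final == y).mean())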







Stacking uses multiple layers of classifiers: a second-level classifier makes the final prediction from the outputs of the first-level classifiers.
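A sketch of that data flow (the meta-features here are out-of-fold predictions from cross_val_predict, one standard way to build them; model choices and seeds are arbitrary):

import numpy as np
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

iris = datasets.load_iris()
x_train, x_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state=0)

level1 = [KNeighborsClassifier(), DecisionTreeClassifier(random_state=0)]
# Meta-features: each first-level model's out-of-fold predictions become
# one input column for the second-level model
meta_train = np.column_stack([cross_val_predict(m, x_train, y_train, cv=3) for m in level1])
for m in level1:
    m.fit(x_train, y_train)        # refit on all training data for test time
meta_test = np.column_stack([m.predict(x_test) for m in level1])

# The second-level model learns how to combine the first-level outputs
meta = LogisticRegression().fit(meta_train, y_train)
print("stacking sketch accuracy:", meta.score(meta_test, y_test))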



Using bagging to improve the accuracy of KNN and decision trees
# Import packages and the dataset
from sklearn import neighbors
from sklearn import datasets
from sklearn.ensemble import BaggingClassifier
from sklearn import tree
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt

iris = datasets.load_iris()
x_data = iris.data[:, :2]
y_data = iris.target

def plot(model):
    # Get the range of the data
    x_min, x_max = x_data[:, 0].min() - 1, x_data[:, 0].max() + 1
    y_min, y_max = x_data[:, 1].min() - 1, x_data[:, 1].max() + 1

    # Build the mesh grid
    xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.02),
                         np.arange(y_min, y_max, 0.02))

    # ravel, like flatten, turns a multi-dimensional array into 1D;
    # flatten always returns a copy, while ravel returns a view of the
    # original array where possible
    z = model.predict(np.c_[xx.ravel(), yy.ravel()])
    z = z.reshape(xx.shape)
    # Filled contour plot of the decision regions
    cs = plt.contourf(xx, yy, z)

x_train, x_test, y_train, y_test = train_test_split(x_data, y_data)

knn = neighbors.KNeighborsClassifier()
knn.fit(x_train, y_train)
# Plot decision regions
plot(knn)
# Scatter plot of the samples
plt.scatter(x_data[:, 0], x_data[:, 1], c=y_data)
plt.show()
# Accuracy
print("knn:", knn.score(x_test, y_test))

dtree = tree.DecisionTreeClassifier()
dtree.fit(x_train, y_train)
plot(dtree)
plt.scatter(x_data[:, 0], x_data[:, 1], c=y_data)
plt.show()
print("dtree:", dtree.score(x_test, y_test))

# Bagging: train an ensemble of 100 KNN classifiers
bagging_knn = BaggingClassifier(knn, n_estimators=100)
bagging_knn.fit(x_train, y_train)
plot(bagging_knn)
plt.scatter(x_data[:, 0], x_data[:, 1], c=y_data)
plt.show()
print("bagging_knn:", bagging_knn.score(x_test, y_test))

# Bagging: train an ensemble of 100 decision tree classifiers
bagging_tree = BaggingClassifier(dtree, n_estimators=100)
bagging_tree.fit(x_train, y_train)
plot(bagging_tree)
plt.scatter(x_data[:, 0], x_data[:, 1], c=y_data)
plt.show()
print("bagging_tree:", bagging_tree.score(x_test, y_test))
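One note on the design: BaggingClassifier's defaults (bootstrap=True, max_samples=1.0) draw bootstrap samples the same size as the training set, so each base model sees roughly 63% of the distinct training points. Bagging mostly reduces variance, which is why a high-variance model like a fully grown decision tree usually gains more from it than a stable model like KNN.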
Using a random forest to improve on a single decision tree
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import matplotlib.pyplot as plt

# Load the data
data = np.genfromtxt("LR-testSet2.txt", delimiter=",")
x_data = data[:, :-1]
y_data = data[:, -1]

plt.scatter(x_data[:, 0], x_data[:, 1], c=y_data)
plt.show()

x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.5)

def plot(model):
    # Get the range of the data
    x_min, x_max = x_data[:, 0].min() - 1, x_data[:, 0].max() + 1
    y_min, y_max = x_data[:, 1].min() - 1, x_data[:, 1].max() + 1

    # Build the mesh grid
    xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.02),
                         np.arange(y_min, y_max, 0.02))

    # ravel, like flatten, turns a multi-dimensional array into 1D;
    # flatten returns a copy, ravel returns a view where possible
    z = model.predict(np.c_[xx.ravel(), yy.ravel()])
    z = z.reshape(xx.shape)
    # Filled contour plot
    cs = plt.contourf(xx, yy, z)
    # Scatter plot of the test samples
    plt.scatter(x_test[:, 0], x_test[:, 1], c=y_test)
    plt.show()

dtree = tree.DecisionTreeClassifier()
dtree.fit(x_train, y_train)
plot(dtree)
print("dtree:", dtree.score(x_test, y_test))

# Random forest with 50 trees
RF = RandomForestClassifier(n_estimators=50)
RF.fit(x_train, y_train)
plot(RF)
print("RandomForest:", RF.score(x_test, y_test))
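In practice the two most important RandomForestClassifier parameters are n_estimators (more trees give more stable predictions, at higher cost) and max_features, which controls how many features each split may consider; for classification, recent sklearn versions default to the square root of the feature count.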
Below is the data used (contents of LR-testSet2.txt):
0.051267,0.69956,1
-0.092742,0.68494,1
-0.21371,0.69225,1
-0.375,0.50219,1
-0.51325,0.46564,1
-0.52477,0.2098,1
-0.39804,0.034357,1
-0.30588,-0.19225,1
0.016705,-0.40424,1
0.13191,-0.51389,1
0.38537,-0.56506,1
0.52938,-0.5212,1
0.63882,-0.24342,1
0.73675,-0.18494,1
0.54666,0.48757,1
0.322,0.5826,1
0.16647,0.53874,1
-0.046659,0.81652,1
-0.17339,0.69956,1
-0.47869,0.63377,1
-0.60541,0.59722,1
-0.62846,0.33406,1
-0.59389,0.005117,1
-0.42108,-0.27266,1
-0.11578,-0.39693,1
0.20104,-0.60161,1
0.46601,-0.53582,1
0.67339,-0.53582,1
-0.13882,0.54605,1
-0.29435,0.77997,1
-0.26555,0.96272,1
-0.16187,0.8019,1
-0.17339,0.64839,1
-0.28283,0.47295,1
-0.36348,0.31213,1
-0.30012,0.027047,1
-0.23675,-0.21418,1
-0.06394,-0.18494,1
0.062788,-0.16301,1
0.22984,-0.41155,1
0.2932,-0.2288,1
0.48329,-0.18494,1
0.64459,-0.14108,1
0.46025,0.012427,1
0.6273,0.15863,1
0.57546,0.26827,1
0.72523,0.44371,1
0.22408,0.52412,1
0.44297,0.67032,1
0.322,0.69225,1
0.13767,0.57529,1
-0.0063364,0.39985,1
-0.092742,0.55336,1
-0.20795,0.35599,1
-0.20795,0.17325,1
-0.43836,0.21711,1
-0.21947,-0.016813,1
-0.13882,-0.27266,1
0.18376,0.93348,0
0.22408,0.77997,0
0.29896,0.61915,0
0.50634,0.75804,0
0.61578,0.7288,0
0.60426,0.59722,0
0.76555,0.50219,0
0.92684,0.3633,0
0.82316,0.27558,0
0.96141,0.085526,0
0.93836,0.012427,0
0.86348,-0.082602,0
0.89804,-0.20687,0
0.85196,-0.36769,0
0.82892,-0.5212,0
0.79435,-0.55775,0
0.59274,-0.7405,0
0.51786,-0.5943,0
0.46601,-0.41886,0
0.35081,-0.57968,0
0.28744,-0.76974,0
0.085829,-0.75512,0
0.14919,-0.57968,0
-0.13306,-0.4481,0
-0.40956,-0.41155,0
-0.39228,-0.25804,0
-0.74366,-0.25804,0
-0.69758,0.041667,0
-0.75518,0.2902,0
-0.69758,0.68494,0
-0.4038,0.70687,0
-0.38076,0.91886,0
-0.50749,0.90424,0
-0.54781,0.70687,0
0.10311,0.77997,0
0.057028,0.91886,0
-0.10426,0.99196,0
-0.081221,1.1089,0
0.28744,1.087,0
0.39689,0.82383,0
0.63882,0.88962,0
0.82316,0.66301,0
0.67339,0.64108,0
1.0709,0.10015,0
-0.046659,-0.57968,0
-0.23675,-0.63816,0
-0.15035,-0.36769,0
-0.49021,-0.3019,0
-0.46717,-0.13377,0
-0.28859,-0.060673,0
-0.61118,-0.067982,0
-0.66302,-0.21418,0
-0.59965,-0.41886,0
-0.72638,-0.082602,0
-0.83007,0.31213,0
-0.72062,0.53874,0
-0.59389,0.49488,0
-0.48445,0.99927,0
-0.0063364,0.99927,0
0.63265,-0.030612,0
AdaBoost with decision trees
import numpy as np
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import make_gaussian_quantiles

# 2D Gaussian data split into two classes by quantile:
# 500 samples, 2 features, mean defaults to (0, 0)
x1, y1 = make_gaussian_quantiles(n_samples=500, n_features=2, n_classes=2)
# Another 500 samples, 2 features, both feature means equal to 3
x2, y2 = make_gaussian_quantiles(mean=(3, 3), n_samples=500, n_features=2, n_classes=2)
# Merge the two sets into one dataset (labels of the second set are flipped)
x_data = np.concatenate((x1, x2))
y_data = np.concatenate((y1, -y2 + 1))

plt.scatter(x_data[:, 0], x_data[:, 1], c=y_data)
plt.show()

# A single decision tree
model = tree.DecisionTreeClassifier(max_depth=3)
model.fit(x_data, y_data)

# Get the range of the data
x_min, x_max = x_data[:, 0].min() - 1, x_data[:, 0].max() + 1
y_min, y_max = x_data[:, 1].min() - 1, x_data[:, 1].max() + 1

# Build the mesh grid
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.02),
                     np.arange(y_min, y_max, 0.02))

# ravel, like flatten, turns a multi-dimensional array into 1D;
# flatten returns a copy, ravel returns a view where possible
z = model.predict(np.c_[xx.ravel(), yy.ravel()])
z = z.reshape(xx.shape)
# Filled contour plot
cs = plt.contourf(xx, yy, z)
# Scatter plot of the samples
plt.scatter(x_data[:, 0], x_data[:, 1], c=y_data)
plt.show()
# Model accuracy
print("decision tree:", model.score(x_data, y_data))

# AdaBoost: ten depth-3 decision trees as weak learners
model = AdaBoostClassifier(DecisionTreeClassifier(max_depth=3), n_estimators=10)
model.fit(x_data, y_data)

# Get the range of the data
x_min, x_max = x_data[:, 0].min() - 1, x_data[:, 0].max() + 1
y_min, y_max = x_data[:, 1].min() - 1, x_data[:, 1].max() + 1

# Build the mesh grid
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.02),
                     np.arange(y_min, y_max, 0.02))

# Predict over the grid
z = model.predict(np.c_[xx.ravel(), yy.ravel()])
z = z.reshape(xx.shape)
cs = plt.contourf(xx, yy, z)
plt.scatter(x_data[:, 0], x_data[:, 1], c=y_data)
plt.show()
# Model accuracy (much higher here)
print("adaboost:", model.score(x_data, y_data))
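Note that both scores above are computed on the training data itself, which flatters AdaBoost; for a fair comparison, evaluate on a held-out test set as in the earlier examples.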
Stacking classifier
from sklearn import datasets
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from mlxtend.classifier import StackingClassifier  # pip install mlxtend

# Load the dataset
iris = datasets.load_iris()
# Keep only feature columns 1 and 2
x_data, y_data = iris.data[:, 1:3], iris.target

# Three different first-level classifiers
clf1 = KNeighborsClassifier(n_neighbors=1)
clf2 = DecisionTreeClassifier()
clf3 = LogisticRegression()

# A second-level (meta) classifier
lr = LogisticRegression()
sclf = StackingClassifier(classifiers=[clf1, clf2, clf3],
                          meta_classifier=lr)

for clf, label in zip([clf1, clf2, clf3, sclf],
                      ['KNN', 'Decision Tree', 'LogisticRegression', 'StackingClassifier']):
    scores = model_selection.cross_val_score(clf, x_data, y_data, cv=3, scoring='accuracy')
    print("Accuracy: %0.2f [%s]" % (scores.mean(), label))
Voting classifier
from sklearn import datasets
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier

# Load the dataset
iris = datasets.load_iris()
# Keep only feature columns 1 and 2
x_data, y_data = iris.data[:, 1:3], iris.target

# Three different classifiers
clf1 = KNeighborsClassifier(n_neighbors=1)
clf2 = DecisionTreeClassifier()
clf3 = LogisticRegression()

sclf = VotingClassifier([('knn', clf1), ('dtree', clf2), ('lr', clf3)])

for clf, label in zip([clf1, clf2, clf3, sclf],
                      ['KNN', 'Decision Tree', 'LogisticRegression', 'VotingClassifier']):
    scores = model_selection.cross_val_score(clf, x_data, y_data, cv=3, scoring='accuracy')
    print("Accuracy: %0.2f [%s]" % (scores.mean(), label))
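By default VotingClassifier uses hard voting (majority class). Since all three classifiers here implement predict_proba, you could also average the predicted probabilities with soft voting, which often behaves better near class boundaries:

# Soft voting averages predicted class probabilities instead of counting votes
sclf_soft = VotingClassifier([('knn', clf1), ('dtree', clf2), ('lr', clf3)], voting='soft')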
