Running multiple models in scikit-learn and visualizing the results


Reference: https://blog.csdn.net/qq_34106574/article/details/82016442

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA

%matplotlib inline
 
h = .02  # step size in the mesh
# Nearest Neighbors, linear SVM, RBF SVM, decision tree, random forest, AdaBoost, naive Bayes, LDA, QDA
names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", "Decision Tree",
         "Random Forest", "AdaBoost", "Naive Bayes", "LDA", "QDA"]
classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    AdaBoostClassifier(),
    GaussianNB(),
    LDA(),
    QDA()]
"""
make_classification用於生成數據樣本
n_features:特征個數=n_informative()+n_redundant+n_repeated
n_informative:多信息特征的個數
n_redundant:冗余信息,informative特征的隨機線性組合
n_repeated:重復信息,隨機提取n_informative和n_redundant特征
n_classes:分類類別
n_clusters_per_class:某一個類別是由幾個cluster構成的
weights:列表類型,權重比
class_sep:乘以超立方體大小的因子。較大的值分散了簇/類,並使分類任務更容易。默認為1
random_state: 
如果是int,random_state是隨機數發生器使用的種子; 
如果RandomState實例,random_state是隨機數生成器; 
如果沒有,則隨機數生成器是np.random使用的RandomState實例。
返回值:
X:形狀數組[n_samples,n_features]
生成的樣本。
y:形狀數組[n_samples]
每個樣本的類成員的整數標簽。
"""
X, y = make_classification(n_features=2, n_redundant=0, n_informative=2,
                           random_state=1, n_clusters_per_class=1)
rng = np.random.RandomState(2)
X += 2 * rng.uniform(size=X.shape)
linearly_separable = (X, y)

# Build three datasets: moons, circles, and linearly separable
datasets = [make_moons(noise=0.3, random_state=0),
            make_circles(noise=0.2, factor=0.5, random_state=1),
            linearly_separable
            ]
# Create a new figure; the argument is its size in inches.
figure = plt.figure(figsize=(27, 9))
i = 1
# iterate over datasets
for ds in datasets:
    # preprocess dataset, split into training and test part
    # unpack the data matrix and labels
    X, y = ds
    # StandardScaler standardizes each feature to zero mean and unit variance;
    # the scaling is done per feature, not per sample.
    X = StandardScaler().fit_transform(X)
    # split into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4)

    x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
    y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
    # np.meshgrid: build coordinate matrices from coordinate vectors
    """
    Intuition: in a 2D plane, if the x axis can take the values 1, 2, 3 and the
    y axis can take the values 7, 8, how many grid points can be formed?
    Clearly 6:
    (1,7) (2,7) (3,7)
    (1,8) (2,8) (3,8)
    """
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
 
    # just plot the dataset first
    # A colormap is a lookup table in the plotting library that maps values to colors
    # (e.g. one value to red, another to light blue).
    cm = plt.cm.RdBu
    cm_bright = ListedColormap(['#FF0000', '#0000FF'])
    ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
    # Plot the training points
    ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright)
    # and testing points
    ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, alpha=0.6)
    ax.set_xlim(xx.min(), xx.max())
    ax.set_ylim(yy.min(), yy.max())
    ax.set_xticks(())
    ax.set_yticks(())
    i += 1
 
    # iterate over classifiers
    for name, clf in zip(names, classifiers):
        ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)
 
        # Plot the decision boundary. For that, we will assign a color to each
        # point in the mesh [x_min, x_max]x[y_min, y_max].
        if hasattr(clf, "decision_function"):
            Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
        else:
            Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
 
        # Put the result into a color plot
        Z = Z.reshape(xx.shape)
        ax.contourf(xx, yy, Z, cmap=cm, alpha=.8)
 
        # Plot also the training points
        ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright)
        # and testing points
        ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright,
                   alpha=0.6)
 
        ax.set_xlim(xx.min(), xx.max())
        ax.set_ylim(yy.min(), yy.max())
        ax.set_xticks(())
        ax.set_yticks(())
        ax.set_title(name)
        ax.text(xx.max() - .3, yy.min() + .3, ('%.2f' % score).lstrip('0'),
                size=15, horizontalalignment='right')
        i += 1
 
figure.subplots_adjust(left=.02, right=.98)
plt.show()

Result:

The script produces a grid of decision-boundary plots, one row per dataset and one column per classifier (plus the raw data in the first column), with each classifier's test accuracy printed in the corner of its panel.

The accuracies are summarized in the table below:

Dataset | Nearest Neighbors | Linear SVM | RBF SVM | Decision Tree | Random Forest | AdaBoost | Naive Bayes | LDA  | QDA
Moons   | 0.93              | 0.78       | 0.90    | 0.82          | 0.88          | 0.93     | 0.80        | 0.80 | 0.80
Circles | 0.93              | 0.47       | 0.97    | 0.78          | 0.80          | 0.85     | 0.90        | 0.38 | 0.90
Linear  | 0.93              | 0.93       | 0.95    | 0.93          | 0.95          | 0.93     | 0.97        | 0.97 | 0.93
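The accuracy values above come from clf.score(X_test, y_test). As a rough sketch (not part of the original post), the same numbers can be collected into a plain-text table by rerunning the fit/score loop and printing the results; the dataset labels below are my own, and since train_test_split is not seeded here the values will vary slightly between runs:

# Sketch: print a dataset-by-classifier accuracy table.
# Reuses `names`, `classifiers`, and `datasets` from the script above;
# the dataset labels are illustrative and the scores vary run to run.
dataset_names = ["Moons", "Circles", "Linear"]

print("Dataset".ljust(10) + "".join(n.ljust(20) for n in names))
for ds_name, (X, y) in zip(dataset_names, datasets):
    X = StandardScaler().fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4)
    row = ds_name.ljust(10)
    for clf in classifiers:
        clf.fit(X_train, y_train)
        row += ("%.2f" % clf.score(X_test, y_test)).ljust(20)
    print(row)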

Of course, these are just quick runs of simple models; none of the hyperparameters were tuned to their optimum (a minimal tuning sketch follows the list below), but the results still illustrate a few points:

1. When the data are linearly separable, the classification accuracies of the different methods do not differ much.

2. Linear classifiers cannot cope with data that are not linearly separable.

3. Nearest Neighbors and the RBF SVM perform fairly well on all three datasets.
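
For completeness, here is a minimal sketch of what tuning one of these models could look like, using GridSearchCV on the RBF SVM over the moons data; the parameter grid and the random_state of the split are illustrative assumptions, not values from the original post:

from sklearn.model_selection import GridSearchCV

# Sketch: grid-search C and gamma for the RBF SVM on the moons dataset.
# The grid values and random_state below are illustrative assumptions.
X, y = make_moons(noise=0.3, random_state=0)
X = StandardScaler().fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4, random_state=42)

param_grid = {"C": [0.1, 1, 10, 100], "gamma": [0.1, 0.5, 1, 2, 5]}
grid = GridSearchCV(SVC(), param_grid, cv=5)
grid.fit(X_train, y_train)

print("best params:", grid.best_params_)
print("test accuracy: %.2f" % grid.score(X_test, y_test))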

