Model Evaluation and Prediction in sklearn


I. Model validation methods:

  1. Cross-validated scores: model_selection.cross_val_score(estimator, X)
  2. Cross-validated estimates for each input data point: model_selection.cross_val_predict(estimator, X)
  3. Compute and plot the model's learning curve: model_selection.learning_curve(estimator, X, y)
  4. Compute and plot the model's validation curve: model_selection.validation_curve(estimator, ...)
  5. Evaluate the significance of a cross-validated score with permutations: model_selection.permutation_test_score(...)

① Cross-validated scores: model_selection.cross_val_score(estimator, X)

import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn import datasets,svm

digits=datasets.load_digits()
X=digits.data
y=digits.target
svc=svm.SVC(kernel='linear')
C_s=np.logspace(-10,0,10)   # 10 candidate values of C, from 1e-10 to 1
print("Number of C values:",len(C_s))
scores=list()
scores_std=list()
n_folds=3
for C in C_s:
    svc.C=C
    # mean and std of the 3-fold cross-validation scores for this C
    this_scores=cross_val_score(svc,X,y,cv=n_folds,n_jobs=1)
    scores.append(np.mean(this_scores))
    scores_std.append(np.std(this_scores))

# Plot the cross-validation scores against C
import matplotlib.pyplot as plt
plt.figure(1,figsize=(4,3))
plt.clf()
plt.semilogx(C_s,scores)
plt.semilogx(C_s,np.array(scores)+np.array(scores_std),'b--')
plt.semilogx(C_s,np.array(scores)-np.array(scores_std),'b--')
locs,labels=plt.yticks()
plt.yticks(locs,list(map(lambda x:"%g" %x,locs)))
plt.ylabel("CV score")
plt.xlabel("Parameter C")
plt.ylim(0,1.1)
plt.show()

Resulting figure: cross-validation score as a function of C, with dashed ±1 standard deviation bands.

② Cross-validated estimates for each input data point: model_selection.cross_val_predict(estimator, X)

from sklearn import datasets,linear_model
from sklearn.model_selection import cross_val_predict

diabetes=datasets.load_diabetes()
X=diabetes.data[:150]
y=diabetes.target[:150]
lasso=linear_model.Lasso()
# each entry of y_pred is produced by a model that never saw that sample during fitting
y_pred=cross_val_predict(lasso,X,y)
print(y_pred)

Output:
[ 174.26933996  117.6539241   164.60228641  155.65049088  132.68647979
  128.49511245  120.76146877  141.069413    164.18904498  182.37394949
  111.04181265  127.94311443  135.0869234   162.83066014  135.3573514
  157.64516523  178.95843326  163.3919841   143.85237903  144.29748882
  133.58117218  124.77928571  132.90918003  208.52927     153.61908967
  154.16616341  118.95351821  163.50467541  145.89406196  168.3308101
  155.87411031  123.45960148  185.70459144  133.38468582  117.2789469
  150.27895019  174.1541028   160.03235091  192.31389633  161.58568256
  154.2224809   119.35517679  146.15706413  133.82056934  179.68118754
  137.96619936  146.07788398  126.77579723  123.32101099  166.26710247
  146.41559964  161.67261029  147.47731459  138.44595305  144.85421048
  113.77990664  185.54970402  115.31624749  142.23672103  171.07792136
  132.5394716   177.80524864  116.5616502   134.25230846  142.88707475
  173.2830912   154.31273504  149.16680759  144.88238997  121.97783103
  110.38457621  180.25559631  199.06141058  151.1195546   161.14217698
  153.96960812  150.77179755  113.30903579  165.15755771  115.85735727
  174.19267171  150.12027233  115.47891783  153.38967232  115.31573467
  156.49909623   92.62211515  178.15649994  131.59320715  134.46166754
  116.97678633  190.00790119  166.01173292  126.25944471  134.29256991
  144.71971963  190.9769591   182.39199466  154.45325308  148.30325558
  151.72036937  124.12825466  138.6011155   137.75891286  123.0917243
  131.74735403  112.07367481  124.56956904  156.78432061  128.63135591
   93.68260079  130.54324394  131.8693231   154.5708257   179.81343019
  165.78130755  150.04779033  162.37974736  143.92996797  143.15645843
  125.20161377  145.99590279  155.3505536   145.97574185  134.66120515
  163.92450638  101.92329396  139.33014324  122.71377023  152.20573113
  153.36931089  116.76545147  131.96936127  109.74817383  132.57453994
  159.38030328  109.31343881  147.69926269  156.3664255   161.12509958
  128.16523686  156.78446286  154.04375702  124.83705022  143.85606595
  143.23651701  147.76316913  154.21572891  129.07895017  157.79644923]

③ Compute and plot the model's learning curve: model_selection.learning_curve(estimator, X, y)

 

import numpy as np
import matplotlib.pyplot as plt
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.datasets import load_digits
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit

def plot_learning_curve(estimator,title,X,y,ylim=None,cv=None,n_jobs=1,train_sizes=np.linspace(.1,1.0,5)):
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    # scores for each training-set size, one column per CV fold
    train_sizes,train_scores,test_scores=learning_curve(
        estimator,X,y,cv=cv,n_jobs=n_jobs,train_sizes=train_sizes)
    train_scores_mean=np.mean(train_scores,axis=1)
    train_scores_std=np.std(train_scores,axis=1)
    test_scores_mean=np.mean(test_scores,axis=1)
    test_scores_std=np.std(test_scores,axis=1)
    plt.grid()
    # shaded bands show +/- one standard deviation across folds
    plt.fill_between(train_sizes,train_scores_mean-train_scores_std,train_scores_mean+train_scores_std,alpha=0.1,color="r")
    plt.fill_between(train_sizes,test_scores_mean-test_scores_std,test_scores_mean+test_scores_std,alpha=0.1,color="g")
    plt.plot(train_sizes,train_scores_mean,"o-",color="r",label="Training score")
    plt.plot(train_sizes,test_scores_mean,"o-",color="g",label="Cross-validation score")

    plt.legend(loc="best")
    return plt

digits=load_digits()
X,y=digits.data,digits.target

title="Learning Curves (Naive Bayes)"
cv=ShuffleSplit(n_splits=100,test_size=0.2,random_state=0)
estimator=GaussianNB()
plot_learning_curve(estimator,title,X,y,ylim=(0.7,1.0),cv=cv,n_jobs=1)

title=r"Learning Curves (SVM, RBF kernel, $\gamma=0.001$)"
cv=ShuffleSplit(n_splits=10,test_size=0.2,random_state=0)
estimator=SVC(gamma=0.001)
plot_learning_curve(estimator,title,X,y,ylim=(0.7,1.01),cv=cv,n_jobs=1)
plt.show()

④ Compute and plot the model's validation curve: model_selection.validation_curve(estimator, ...)

import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import load_digits
from sklearn.svm import SVC
from sklearn.model_selection import validation_curve

digits=load_digits()
X,y=digits.data,digits.target   # the original snippet omitted this step
param_range=np.logspace(-6,-1,5)
# train/test scores for each candidate gamma, one column per CV fold
train_scores,test_scores=validation_curve(SVC(),X,y,param_name="gamma",param_range=param_range,
                                          cv=10,scoring="accuracy",n_jobs=1)
train_scores_mean=np.mean(train_scores,axis=1)
train_scores_std=np.std(train_scores,axis=1)
test_scores_mean=np.mean(test_scores,axis=1)
test_scores_std=np.std(test_scores,axis=1)

plt.title("Validation Curve with SVM")
plt.xlabel(r"$\gamma$")
plt.ylabel("Score")
plt.ylim(0.0,1.1)
lw=2
plt.semilogx(param_range,train_scores_mean,label="Training score",color="darkorange",lw=lw)
plt.fill_between(param_range,train_scores_mean-train_scores_std,train_scores_mean+train_scores_std,
                 alpha=0.2,color="darkorange",lw=lw)
plt.semilogx(param_range,test_scores_mean,label="Cross-validation Score",color="navy",lw=lw)
plt.fill_between(param_range,test_scores_mean-test_scores_std,test_scores_mean+test_scores_std,
                 alpha=0.2,color="navy",lw=lw)
plt.legend(loc="best")
plt.show()


⑤ Evaluate the significance of a cross-validated score with permutations: model_selection.permutation_test_score(...) --- rarely used in practice these days
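
Even so, a minimal sketch is easy to put together (the iris dataset and linear SVC below are illustrative choices, not from the original examples):

from sklearn import datasets,svm
from sklearn.model_selection import permutation_test_score

iris=datasets.load_iris()
X,y=iris.data,iris.target
svc=svm.SVC(kernel='linear')
# refit after shuffling the labels n_permutations times
score,permutation_scores,pvalue=permutation_test_score(
    svc,X,y,cv=5,n_permutations=100,n_jobs=1)
print("True score: %.3f, p-value: %.3f" % (score,pvalue))

The p-value estimates how likely the original score would be if features and labels were independent; a small value suggests the score is not due to chance.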

II. Model evaluation methods

There are two ways to evaluate the predictive performance of an sklearn model:

  • the score method of the Estimator object
  • the scoring parameter used in cross-validation

The score method of the Estimator object

The score(X, y) method internally calls predict to obtain the predicted response y_pred, then compares it with the true response y passed in and computes a score.

Using the estimator's score function to evaluate model performance, the defaults are:

for classifiers, accuracy: sklearn.metrics.accuracy_score

for regressors, the R² score: sklearn.metrics.r2_score
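
A minimal sketch of both defaults (the datasets and estimators below are illustrative):

from sklearn import datasets,svm,linear_model

# classifier: score returns the mean accuracy on the given data
iris=datasets.load_iris()
clf=svm.SVC(kernel='linear').fit(iris.data,iris.target)
print(clf.score(iris.data,iris.target))

# regressor: score returns the coefficient of determination R^2
diabetes=datasets.load_diabetes()
reg=linear_model.LinearRegression().fit(diabetes.data,diabetes.target)
print(reg.score(diabetes.data,diabetes.target))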

Using the scoring parameter in cross-validation

The two model selection tools above both have a "scoring" parameter, which specifies the metric used to measure the estimator's predictive performance during a grid search or when computing cross-validation scores. By default the parameter is None, meaning that "GridSearchCV" and "cross_val_score" call the estimator's own "score" function. You can also assign a different performance measure to "scoring"; it must be a predefined metric name or a callable. sklearn.metrics not only provides a series of predefined scorers but also supports custom evaluation metrics.

Using a predefined scoring parameter in cross-validation:


# Use a predefined scoring parameter in cross-validation

from sklearn import svm,datasets
from sklearn.model_selection import cross_val_score

iris=datasets.load_iris()
X,y=iris.data,iris.target
clf=svm.SVC(probability=True,random_state=0)
print(cross_val_score(clf,X,y,scoring="neg_log_loss"))
#Result: [-0.0757138  -0.16816241 -0.07091847]

model=svm.SVC()
print(cross_val_score(model,X,y,scoring="wrong_choice"))
#Result:

ValueError: 'wrong_choice' is not a valid scoring value. Valid options are ['accuracy', 'adjusted_rand_score', 'average_precision', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'neg_log_loss', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'r2', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'roc_auc']


The available "scoring" strings are stored in the sklearn.metrics.SCORERS dictionary.

A custom scoring parameter can also be used in cross-validation; a detailed walkthrough is available at http://www.studyai.com/course/play/9dd4fa59779d454991f55ac4c85889eb, and a minimal sketch follows.
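
Below, a custom scorer is built with sklearn.metrics.make_scorer (the loss function is an illustrative choice, not taken from the linked course):

import numpy as np
from sklearn import svm,datasets
from sklearn.metrics import SCORERS,make_scorer
from sklearn.model_selection import cross_val_score

print(sorted(SCORERS.keys()))   # the predefined scoring names

def my_custom_loss(y_true,y_pred):
    # fraction of misclassified samples; lower is better
    return np.mean(y_true!=y_pred)

# greater_is_better=False negates the result, so a higher score still means a better model
scorer=make_scorer(my_custom_loss,greater_is_better=False)
iris=datasets.load_iris()
print(cross_val_score(svm.SVC(),iris.data,iris.target,scoring=scorer))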

III. Overview of sklearn classifier evaluation metrics

The performance measurement functions in the sklearn.metrics package cover:

  • classifier performance metrics
  • regressor performance metrics
  • clustering performance metrics
  • pairwise distance measures

Classifier performance metrics

 

Broadly, these fall into three groups:

  • precision, recall, and F-measures (Precision-Recall-F-measures)
  • loss functions (Loss Functions)
  • receiver operating characteristic curves (ROC Curves)

Metrics restricted to binary single-label classification:

  • matthews_corrcoef(y_true, y_pred[, ...]): compute the Matthews correlation coefficient (MCC) for binary classification (a short example follows this list)
  • precision_recall_curve(y_true, probas_pred): compute precision-recall pairs at different probability thresholds, forming a curve
  • roc_curve(y_true, y_score[, pos_label, ...]): compute the ROC curve
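
The MCC example (with illustrative labels):

from sklearn.metrics import matthews_corrcoef
y_true=[+1,+1,+1,-1]
y_pred=[+1,-1,+1,+1]
print(matthews_corrcoef(y_true,y_pred))
# about -0.33; MCC ranges from -1 (total disagreement) to +1 (perfect prediction)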

Metrics usable for binary and multilabel classification:

  • average_precision_score(y_true, y_score[, ...]): compute the average precision (AP) of the prediction scores
  • roc_auc_score(y_true, y_score[, average, ...]): compute the AUC of the prediction scores

Metrics usable for multiclass problems (several of these also work for multilabel problems):

  • cohen_kappa_score(y1, y2[, labels, weights]): Cohen's kappa, an inter-annotator agreement score
  • confusion_matrix(y_true, y_pred[, labels, ...]): confusion matrix
  • hinge_loss(y_true, pred_decision[, labels, ...]): average hinge loss
  • accuracy_score(y_true, y_pred[, normalize, ...]): accuracy
  • classification_report(y_true, y_pred[, ...]): text report of the main classification metrics
  • f1_score(y_true, y_pred[, labels, ...]): F1 score
  • fbeta_score(y_true, y_pred, beta[, labels, ...]): F-beta score
  • hamming_loss(y_true, y_pred[, labels, ...]): average Hamming loss
  • jaccard_similarity_score(y_true, y_pred[, ...]): Jaccard similarity coefficient
  • log_loss(y_true, y_pred[, eps, normalize, ...]): logistic (cross-entropy) loss
  • zero_one_loss(y_true, y_pred[, normalize, ...]): zero-one classification loss
  • precision_recall_fscore_support(y_true, y_pred): precision, recall, F-measure and support for each class

Multiclass performance metrics

Binary metrics can be extended to multiclass or multilabel problems:

Classifier performance evaluation metrics:

  • receiver operating characteristic (ROC) curves -> usable for binary problems
  • Jaccard similarity coefficient -> usable for multiclass problems (a short example follows this list)
  • Matthews correlation coefficient (MCC) -> usable for binary problems
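
The Jaccard example (illustrative inputs; note that for single-label multiclass input the older jaccard_similarity_score API shown here reduces to accuracy, while for multilabel indicator input it averages per-sample Jaccard coefficients):

import numpy as np
from sklearn.metrics import jaccard_similarity_score

# single-label multiclass input: equivalent to accuracy
print(jaccard_similarity_score([0,1,2,3],[0,2,1,3]))
#0.5

# multilabel indicator input: per-sample Jaccard, then averaged
print(jaccard_similarity_score(np.array([[0,1],[1,1]]),np.ones((2,2))))
#0.75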

IV. Classifier evaluation metrics

Accuracy: returns the fraction (default) or, with normalize=False, the number of correctly classified samples.

# Accuracy
from sklearn.metrics import accuracy_score
y_pred=[0,2,1,3]
y_true=[0,1,2,3]
print(accuracy_score(y_true,y_pred))
print(accuracy_score(y_true,y_pred,normalize=False))

#0.5
#2

Confusion matrix (rows are true labels, columns are predicted labels):

from sklearn.metrics import confusion_matrix
y_true=[2,0,2,2,0,1]
y_pred=[0,0,2,2,0,2]
print(confusion_matrix(y_true,y_pred))

y_true=["cat","ant","cat","cat","ant","bird"]
y_pred=["ant","ant","cat","cat","ant","cat"]
print(confusion_matrix(y_true,y_pred,labels=["ant","cat","bird"]))

#[[2 0 0]
# [0 0 1]
# [1 0 2]]
#[[2 0 0]
# [1 2 0]
# [0 1 0]]

Binary classification:

#precision-recall-F-measures
from sklearn import metrics
y_pred=[0,1,0,0]
y_true=[0,1,0,1]
print(metrics.precision_score(y_true,y_pred))
#1.0
print(metrics.recall_score(y_true,y_pred))
#0.5
print(metrics.f1_score(y_true,y_pred))
#0.666666666667
print(metrics.fbeta_score(y_true,y_pred,beta=0.5))
#0.833333333333
print(metrics.fbeta_score(y_true,y_pred,beta=1))
#0.666666666667
print(metrics.fbeta_score(y_true,y_pred,beta=2))
#0.555555555556
print(metrics.precision_recall_fscore_support(y_true,y_pred,beta=0.5))
#(array([ 0.66666667,  1.        ]), array([ 1. ,  0.5]), array([ 0.71428571,  0.83333333]), array([2, 2], dtype=int32))
import numpy as np
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
y_true=np.array([0,0,1,1])
y_score=np.array([0.1,0.4,0.35,0.8])
precision,recall,thresholds=precision_recall_curve(y_true,y_score)
print(precision)
#[ 0.66666667  0.5         1.          1.        ]
print(recall)
#[ 1.   0.5  0.5  0. ]
print(thresholds)
#[ 0.35  0.4   0.8 ]
print(average_precision_score(y_true,y_score))
#0.791666666667

Multiclass and multilabel classification

Treat one class as the positive class and all other classes as negative; taking each class as positive in turn yields per-class P, R, and F values. These per-class results can then be combined in five ways, chosen with the average parameter: "macro", "weighted", "micro", "samples", or average=None.

from sklearn import metrics
y_true=[0,1,2,0,1,2]
y_pred=[0,2,1,0,0,1]
print(metrics.precision_score(y_true,y_pred,average="macro"))
#0.222222222222
print(metrics.recall_score(y_true,y_pred,average="micro"))
#0.333333333333
print(metrics.f1_score(y_true,y_pred,average="weighted"))
#0.266666666667
print(metrics.fbeta_score(y_true,y_pred,average="macro",beta=0.5))
#0.238095238095
print(metrics.precision_recall_fscore_support(y_true,y_pred,beta=0.5,average=None))
#(array([ 0.66666667, 0. , 0. ]), array([ 1., 0., 0.]), array([ 0.71428571, 0. , 0. ]), array([2, 2, 2], dtype=int32))
print(metrics.recall_score(y_true,y_pred,average="micro",labels=[1,2]))
#0.0

 

from sklearn.metrics import classification_report
y_true=[0,1,2,0,1,2]
y_pred=[0,2,1,0,0,1]
target_names=["class0","class1","class2"]
print(classification_report(y_true,y_pred,target_names=target_names))

The output is:

             precision    recall  f1-score   support

     class0       0.67      1.00      0.80         2
     class1       0.00      0.00      0.00         2
     class2       0.00      0.00      0.00         2

avg / total       0.22      0.33      0.27         6

ROC curves

More on ROC curves: http://v.youku.com/v_show/id_XMjcyMzg0MzgwMA==.html?spm=a2h0k.8191407.0.0&from=s1.8-1-1.2

Plotting a ROC curve only requires the true positive rate (TPR) and the false positive rate (FPR); TPR and FPR are viewed as functions of some parameter of the classifier, typically its decision threshold.

TPR measures, out of all positive samples, how many genuine positives the classifier found.

FPR measures, out of all negative samples, how many the classifier wrongly classified as positive.
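
A minimal sketch with roc_curve and roc_auc_score (the scores below are illustrative stand-ins for a classifier's decision_function or predict_proba output):

import numpy as np
from sklearn.metrics import roc_curve,roc_auc_score
y_true=np.array([0,0,1,1])
y_score=np.array([0.1,0.4,0.35,0.8])
# the ROC curve as parallel arrays: one (FPR, TPR) point per threshold
fpr,tpr,thresholds=roc_curve(y_true,y_score)
print(fpr,tpr)
print(roc_auc_score(y_true,y_score))
#0.75, the area under that curve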

