SKlearn - ROC and AUC - 碼上歡樂

SKlearn - ROC and AUC

本文轉載自查看原文 2020-04-13 14:47 875 機器學習框架-Sklearn

ROC、AUC 的理論知識請參考我的博客分類模型評估

本文旨在總結其在 SKlearn 中的用法

基礎用法

先看源碼

def roc_curve(y_true, y_score, pos_label=None, sample_weight=None,
              drop_intermediate=True):
    """Compute Receiver operating characteristic (ROC)
    y_true : array, shape = [n_samples]
        True binary labels. If labels are not either {-1, 1} or {0, 1}, then
        pos_label should be explicitly given.

    y_score : array, shape = [n_samples]
        Target scores, can either be probability estimates of the positive
        class, confidence values, or non-thresholded measure of decisions
        (as returned by "decision_function" on some classifiers).

    pos_label : int or str, default=None
        The label of the positive class.
        When ``pos_label=None``, if y_true is in {-1, 1} or {0, 1},
        ``pos_label`` is set to 1, otherwise an error will be raised.
        Selects which label in y_true counts as the positive class; e.g. if
        the labels are [1, 2] and 2 is the positive class, pass pos_label=2.
        When pos_label is None and y_true is in {-1, 1} or {0, 1}, pos_label
        is automatically set to 1.

    sample_weight : array-like of shape (n_samples,), default=None
        Sample weights.

    drop_intermediate : boolean, optional (default=True)
        Whether to drop some suboptimal thresholds which would not appear
        on a plotted ROC curve. This is useful in order to create lighter
        ROC curves.

        .. versionadded:: 0.17
           parameter *drop_intermediate*.

    Returns
    -------
    fpr : array, shape = [>2]
        Increasing false positive rates such that element i is the false
        positive rate of predictions with score >= thresholds[i].

    tpr : array, shape = [>2]
        Increasing true positive rates such that element i is the true
        positive rate of predictions with score >= thresholds[i].

    thresholds : array, shape = [n_thresholds]
        Decreasing thresholds on the decision function used to compute
        fpr and tpr. `thresholds[0]` represents no instances being predicted
        and is arbitrarily set to `max(y_score) + 1`.
    """

然后看一個最普通的示例，包括 ROC 的計算、AUC 的計算、ROC 曲線繪制

import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import auc, roc_auc_score, roc_curve


# ------------------------------ ROC and AUC ------------------------------
# Four samples: the true labels use {1, 2}, so the positive class has to be
# named explicitly through pos_label below.
y_score = np.array([0.1, 0.4, 0.35, 0.8])
y_true = np.array([1, 1, 2, 2])

# ---- Compute the ROC ----
# pos_label tells roc_curve which label is the positive class.
roc_points = roc_curve(y_true, y_score, pos_label=2)
fpr, tpr, thresholds = roc_points
for values in roc_points:
    print(values)
# Printed in order (values reported in the original post):
#   fpr        -> [0. , 0. , 0.5, 0.5, 1. ]
#   tpr        -> [0. , 0.5, 0.5, 1. , 1. ]
#   thresholds -> [1.8 , 0.8 , 0.4 , 0.35, 0.1 ]
# NOTE(review): newer scikit-learn releases appear to report inf instead of
# max(y_score) + 1 as the first threshold -- confirm against your version.

# ---- How to read the result ----
# 1. threshold 0.1 : every negative AND every positive sample is predicted
#    positive -- the threshold is so low that everything is "positive".
# 2. threshold 0.35: half of the negatives are predicted positive and all
#    positives are found -- too many false accepts; for face-recognition
#    cash withdrawal, other people could empty your account.
# 3. threshold 0.4 : half of the negatives and half of the positives are
#    predicted positive -- a poor operating point on both counts.
# 4. threshold 0.8 : no negative is predicted positive, but only half of the
#    positives are found -- nobody else can withdraw your money, but you may
#    not be able to either.
# 5. threshold 1.8 : every sample is predicted negative -- threshold too high.

# ---- Compute the AUC ----
# Area under the ROC curve, first from the curve points, then directly from
# the labels and scores; both print 0.75.
area_under_curve = auc(fpr, tpr)
print(area_under_curve)
print(roc_auc_score(y_true, y_score))

# ---- Draw the ROC curve ----
plt.plot(fpr, tpr)
plt.show()

輸出

EER 選擇模型閾值

ROC 用於優化模型

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.metrics import roc_auc_score,roc_curve
import matplotlib.pyplot as plt
import numpy as np

# Turn the 3-class iris data into a binary problem: class 2 becomes the
# positive class (1); classes 0 and 1 are merged into the negative class (0).
# A fresh label array is built so the loaded dataset is not mutated in place.
iris = load_iris()
y_binary = (iris.target == 2).astype(int)

# Seed the split so the printed numbers are reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    iris.data, y_binary, test_size=0.3, random_state=0)

# multi_class is left at its default: the problem is binary, so a
# one-vs-rest setting adds nothing, and the argument is deprecated in
# recent scikit-learn releases.
model = LogisticRegression(solver='newton-cg')
model.fit(x_train, y_train)
y_pre = model.predict_proba(x_test)
print('predict_proba is', y_pre)

# Second column = predicted probability of the positive class (label 1):
# values near 0 mean "class 0", values near 1 mean "class 1".
y_0 = y_pre[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_0)
print(thresholds)
# Sample output from the original (unseeded) post; your numbers will differ:
# [1.98964087e+00 9.89640873e-01 6.03375665e-01 5.68953989e-01
#  4.81061404e-01 2.40418592e-01 2.24425917e-01 3.43507028e-06]

auc = roc_auc_score(y_test, y_0)  # area under the ROC curve

# ---- KS statistic ----
# KS is the largest gap between TPR and FPR along the curve; the threshold
# at which it is reached is taken as the best operating point. np.argmax
# returns the first maximum, matching a left-to-right scan with a strict ">".
ks_values = tpr - fpr
best_idx = int(np.argmax(ks_values))
KS_max = ks_values[best_idx]
best_thr = thresholds[best_idx]
print('最大KS為：',KS_max)          # e.g. 1.0 in the original post
print('最佳閾值為：',best_thr)      # e.g. 0.6998150731799142 in the original post

# ---- Plot ----
plt.figure()
plt.plot(fpr, tpr)
plt.plot(fpr, tpr, 'o')          # mark each (fpr, tpr) point of the curve
# Red anti-diagonal from (0, 1) to (1, 0), i.e. the line tpr = 1 - fpr.
plt.plot([0, 1], [1, 0], 'r')
# Plain-text title: wrapped in $...$ it would be rendered as mathtext, which
# drops the space between the two words.
plt.title('ROC curve')
plt.show()

輸出

顯然第 3 個點(圓圈內) 離紅色直線 y = 1 - x（即代碼中連接 (0, 1) 與 (1, 0) 的那條線）最近

one vs rest 多分類 ROC

每個二分類都有一個 ROC

import numpy as np
import matplotlib.pyplot as plt
from itertools import cycle

from sklearn import svm, datasets
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
# NOTE: the original `from scipy import interp` was dropped here: it is not
# used in this snippet and the alias no longer exists in recent SciPy
# releases, so the import alone would make the script fail.

# Load the iris data set.
iris = datasets.load_iris()
X = iris.data  # X.shape==(150, 4)
y = iris.target  # y.shape==(150, )

# Binarize the labels: one indicator column per class.
y = label_binarize(y, classes=[0, 1, 2])  # shape==(150, 3)
n_classes = y.shape[1]  # n_classes==3

# Append noise features to make the problem harder.
random_state = np.random.RandomState(0)
n_samples, n_features = X.shape  # n_samples==150, n_features==4
# 4 original + 200 * 4 noise columns -> shape==(150, 804)
X = np.c_[X, random_state.randn(n_samples, 200 * n_features)]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5, random_state=0)

# Learn to separate each class from all the others (one-vs-rest).
classifier = OneVsRestClassifier(svm.SVC(kernel='linear', probability=True, random_state=random_state))
y_score = classifier.fit(X_train, y_train).decision_function(X_test)

# Compute one ROC curve and one AUC per class, treating column i of the
# binarized labels as an independent binary problem.
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])


plt.figure()
lw = 2
# Iterate over n_classes (not a hard-coded 3) and cycle the colours so the
# loop still works if the number of classes changes. The class index is part
# of the label so the legend entries can be told apart.
for i, color in zip(range(n_classes), cycle(['r', 'g', 'b'])):
    plt.plot(fpr[i], tpr[i], color=color, lw=lw,
             label='ROC curve of class %d (area = %0.2f)' % (i, roc_auc[i]))
# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')

plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()

輸出

多分類 - 宏 ROC 微 ROC

宏 ROC 和微 ROC 好像有點繞，個人覺得參考資料中這兩個搞反了，本人做如下解釋來區分這兩個概念

宏 ROC：先對每個二分類（每個類別）各自計算 ROC 曲線，再把各條曲線取平均得到總的 ROC

微 ROC：先把所有二分類的標簽和得分合並（攤平）成一個大的二分類問題，再計算一條總的 ROC

import numpy as np
import matplotlib.pyplot as plt
from itertools import cycle

from sklearn import svm, datasets
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
from scipy.interpolate import lagrange, interp1d
# NOTE: `from scipy import interp` was replaced by np.interp below; the SciPy
# name was only an alias of numpy.interp and no longer exists in recent SciPy
# releases.


iris = datasets.load_iris()
X = iris.data  # X.shape==(150, 4)
y = iris.target  # y.shape==(150, )

# Binarize the labels: one indicator column per class.
y = label_binarize(y, classes=[0, 1, 2])  # shape==(150, 3)
n_classes = y.shape[1]  # n_classes==3

# Append noise features to make the problem harder.
random_state = np.random.RandomState(0)
n_samples, n_features = X.shape  # n_samples==150, n_features==4
# 4 original + 200 * 4 noise columns -> shape==(150, 804)
X = np.c_[X, random_state.randn(n_samples, 200 * n_features)]

# Shuffle the data and split it into training and test halves.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5, random_state=0)

classifier = OneVsRestClassifier(svm.SVC(kernel='linear', probability=True, random_state=random_state))
y_score = classifier.fit(X_train, y_train).decision_function(X_test)

# Compute one ROC curve and one AUC per class.
fpr = dict()        # false positive rate
tpr = dict()        # true positive rate
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# ----------------------- micro-average ROC curve and AUC -----------------------
# Pool first, then evaluate: flatten the label and score matrices so every
# (sample, class) pair becomes one binary decision, and compute a single ROC.
fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

# ----------------------- macro-average ROC curve and AUC -----------------------
# Evaluate each class on its own first, then average the per-class curves.
# Collect every FPR value that occurs on any per-class curve.
all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
print(all_fpr.shape)        # e.g. (42,)

# Interpolate each per-class curve onto the common FPR grid and add them up.
# (The original post also mentions scipy.interpolate.interp1d as an
# alternative way to do this interpolation.)
mean_tpr = np.zeros_like(all_fpr)
for i in range(n_classes):
    mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])

# Divide by the number of classes to get the average curve, then its AUC.
mean_tpr /= n_classes
print(mean_tpr)

fpr["macro"] = all_fpr
tpr["macro"] = mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

# ----------------------------- plot all ROC curves -----------------------------
plt.figure()
lw = 2
plt.plot(fpr["micro"], tpr["micro"], label='micro-average ROC curve (area = {0:0.2f})'.format(roc_auc["micro"]), color='deeppink', linestyle=':', linewidth=4)
plt.plot(fpr["macro"], tpr["macro"], label='macro-average ROC curve (area = {0:0.2f})'.format(roc_auc["macro"]), color='navy', linestyle=':', linewidth=4)

colors = cycle(['aqua', 'darkorange', 'cornflowerblue'])
for i, color in zip(range(n_classes), colors):
    plt.plot(fpr[i], tpr[i], color=color, lw=lw, label='ROC curve of class {0} (area = {1:0.2f})'.format(i, roc_auc[i]))

# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], 'k--', lw=lw)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Some extension of Receiver operating characteristic to multi-class')
plt.legend(loc="lower right")
plt.show()

輸出

參考資料：

https://blog.csdn.net/hfutdog/article/details/88079934

https://www.jianshu.com/p/90106243d231

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯系本站郵箱yoyou2525@163.com刪除。

猜您在找 sklearn學習：為什么roc_auc_score()和auc()有不同的結果？ sklearn.metrics中的confusion_matrix、ROC、AUC指標 ROC曲線與AUC值 ROC曲線與AUC ROC和AUC的區別 keras 上添加 roc auc指標 sklearn 繪制roc曲線 AUC （ROC曲線下方的面積大小） AUC，ROC我看到的最透徹的講解多分類下的ROC曲線和AUC