邏輯回歸閾值修改
#使用sklearn乳腺癌數據集驗證
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression as LR
import numpy as np
# Show floats in plain fixed-point notation rather than scientific notation.
np.set_printoptions(suppress=True)

# Load the breast-cancer dataset and fit a logistic regression with the
# library's default settings on the full data.
data = load_breast_cancer()
lr = LR()
lr.fit(data.data, data.target)

# Predict on the training samples: hard class labels first, then the
# per-class probability estimates.
predicted_labels = lr.predict(data.data)
print(predicted_labels)
predicted_probabilities = lr.predict_proba(data.data)
print(predicted_probabilities)
打印結果如下:
- predict(x):直接輸出0-1二分類結果
- predict_proba(x): 分別輸出樣本屬於0類和1類的概率
#預測源碼
def predict(self, X):
    """
    Predict class labels for samples in X.

    The label is derived directly from the raw output of
    ``self.decision_function``; no probability is computed here.

    Parameters
    ----------
    X : array-like or sparse matrix, shape (n_samples, n_features)
        Samples.

    Returns
    -------
    C : array, shape [n_samples]
        Predicted class label per sample.
    """
    scores = self.decision_function(X)
    if len(scores.shape) == 1:
        # 1-D scores (binary case): a strictly positive score selects
        # classes_[1]; zero or negative selects classes_[0].
        indices = (scores > 0).astype(int)
    else:
        # 2-D scores: pick the class whose column has the highest score.
        indices = scores.argmax(axis=1)
    return self.classes_[indices]
# 分類概率源碼
def predict_proba(self, X):
    """
    Probability estimates.

    The returned estimates for all classes are ordered by the
    label of classes.

    For a multi_class problem, if multi_class is set to be "multinomial"
    the softmax function is used to find the predicted probability of
    each class.
    Else use a one-vs-rest approach, i.e. calculate the probability
    of each class assuming it to be positive using the logistic function,
    and normalize these values across all the classes.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Vector to be scored, where `n_samples` is the number of samples and
        `n_features` is the number of features.

    Returns
    -------
    T : array-like of shape (n_samples, n_classes)
        Returns the probability of the sample for each class in the model,
        where classes are ordered as they are in ``self.classes_``.
    """
    # NOTE(review): this method is shown outside its class; `super()`,
    # `check_is_fitted` and `softmax` are not defined in the visible part
    # of this file and must come from the original library module.
    check_is_fitted(self)

    # One-vs-rest is used when requested explicitly, or when multi_class is
    # 'auto' and the problem has at most two classes or the solver is
    # 'liblinear'.
    ovr = (self.multi_class in ["ovr", "warn"] or
           (self.multi_class == 'auto' and (self.classes_.size <= 2 or
                                            self.solver == 'liblinear')))
    if ovr:
        return super()._predict_proba_lr(X)
    else:
        decision = self.decision_function(X)
        if decision.ndim == 1:
            # Workaround for multi_class="multinomial" and binary outcomes
            # which requires softmax prediction with only a 1D decision.
            decision_2d = np.c_[-decision, decision]
        else:
            decision_2d = decision
        return softmax(decision_2d, copy=False)
# 輸出線性結果
def decision_function(self, X):
    """
    Predict confidence scores for samples.

    The confidence score for a sample is proportional to the signed
    distance of that sample to the hyperplane.

    Parameters
    ----------
    X : array-like or sparse matrix, shape (n_samples, n_features)
        Samples.

    Returns
    -------
    array, shape=(n_samples,) if n_classes == 2 else (n_samples, n_classes)
        Confidence scores per (sample, class) combination. In the binary
        case, confidence score for self.classes_[1] where >0 means this
        class would be predicted.

    Raises
    ------
    ValueError
        If the number of columns of X differs from ``self.coef_.shape[1]``.
    """
    # NOTE(review): `check_is_fitted`, `check_array` and `safe_sparse_dot`
    # are not defined in the visible part of this file; they must come from
    # the original library module.
    check_is_fitted(self)

    X = check_array(X, accept_sparse='csr')

    n_features = self.coef_.shape[1]
    if X.shape[1] != n_features:
        raise ValueError("X has %d features per sample; expecting %d"
                         % (X.shape[1], n_features))

    # Plain linear model output: X @ coef_.T + intercept_. No sigmoid or
    # softmax is applied at this stage.
    scores = safe_sparse_dot(X, self.coef_.T,
                             dense_output=True) + self.intercept_
    # A single score column is flattened to a 1-D array.
    return scores.ravel() if scores.shape[1] == 1 else scores
"""
從源碼可以看出:predict()和predict_proba()
都使用了"decision = self.decision_function(X)" 然后進一步處理,
而decision看源碼剛好是 "scores = safe_sparse_dot(X, self.coef_.T,
dense_output=True) + self.intercept_"(截距),也就是未經過sigmoid的線性結果
print(lr.decision_function(data.data))
# 這里直接打印decision結果,打印結果如下:
"""
結論:
從結果可以看出,這是線性擬合函數decision_function的結果,我們可以人為加入sigmoid:
print(1/(1+np.exp(-lr.decision_function(data.data))))
求取轉化后[0,1]之間的連續值,這時就可以自定義threshold閾值,對結果進行切分,或直接當作評分結果使用。
備注:
'''LR.predict()默認的閾值為0.5(對應線性擬合函數的結果為0):線性擬合函數結果為正值就歸於1,否則(負值或0)就歸為0。從源碼就可以看出,這一步並沒有經過sigmoid函數'''
scores = self.decision_function(X)
if len(scores.shape) == 1:
indices = (scores > 0).astype(int)
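'''LR.predict_proba()從源碼可以看出,在multinomial分支中使用的是softmax()函數得出的結果
如 線性擬合函數值decision為-33,然后就使用 (np.exp(-decision)/(np.exp(decision)+np.exp(-decision))) 結果約為 1,代表是0類的概率約為1,從上面的第一個樣本結果可以看出
注意:按上面的源碼,當multi_class為'auto'且只有兩個類別時ovr為True,實際走的是_predict_proba_lr()分支(按docstring的說明,使用logistic函數即sigmoid),而不是softmax分支;上面的softmax算式只適用於multinomial的情況
'''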
decision_2d = np.c_[-decision, decision]
return softmax(decision_2d, copy=False)