1. Bayes' Theorem
Suppose there are two events $A$ and $B$ with $P(B) > 0$. The conditional probability of $A$ given $B$ is

$$P(A \mid B) = \frac{P(AB)}{P(B)} \tag{1}$$

From Eq. (1) we can derive Bayes' theorem:

$$P(A \mid B) = \frac{P(B \mid A)\,P(A)}{P(B)} \tag{2}$$

Given a sample space $S$ and a partition $A_1, A_2, \dots, A_n$ of $S$ (the $A_i$ are mutually exclusive and their union is $S$), for any event $B$ with $P(B) > 0$ the generalized Bayes' theorem is

$$P(A_i \mid B) = \frac{P(B \mid A_i)\,P(A_i)}{\sum_{j=1}^{n} P(B \mid A_j)\,P(A_j)} \tag{3}$$
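As a quick numeric sanity check, the snippet below applies Eq. (2) together with the law of total probability; all the numbers here are hypothetical, chosen only for illustration:

```python
# Illustrative only: compute P(A|B) from P(B|A) and P(A) via Bayes' theorem.
p_a = 0.01             # hypothetical prior P(A)
p_b_given_a = 0.9      # hypothetical P(B|A)
p_b_given_not_a = 0.1  # hypothetical P(B|~A)

# Law of total probability: P(B) = P(B|A)P(A) + P(B|~A)P(~A)
p_b = p_b_given_a * p_a + p_b_given_not_a * (1 - p_a)
p_a_given_b = p_b_given_a * p_a / p_b
print(p_a_given_b)     # ~0.0833
```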
2. Fundamentals of Naive Bayes
Given a training data set $T = \{(x_1, y_1), (x_2, y_2), \dots, (x_N, y_N)\}$, where each $x_i$ is an $n$-dimensional feature vector and $y_i \in \{c_1, c_2, \dots, c_K\}$ is its class label.

Suppose we are now given a new sample $x$. We want to determine which class it belongs to, i.e., to find the class that maximizes the posterior probability:

$$y = \arg\max_{c_k} P(Y = c_k \mid X = x) \tag{4}$$

How do we compute these posterior probabilities? By Bayes' theorem,

$$P(Y = c_k \mid X = x) = \frac{P(X = x \mid Y = c_k)\,P(Y = c_k)}{\sum_{k} P(X = x \mid Y = c_k)\,P(Y = c_k)} \tag{5}$$

The naive Bayes method assumes the features are conditionally independent given the class, so Eq. (5) can be written as

$$P(Y = c_k \mid X = x) = \frac{P(Y = c_k) \prod_{j} P(X^{(j)} = x^{(j)} \mid Y = c_k)}{\sum_{k} P(Y = c_k) \prod_{j} P(X^{(j)} = x^{(j)} \mid Y = c_k)} \tag{6}$$

Since the denominator of Eq. (6) is the same for every class $c_k$, the classifier reduces to

$$y = \arg\max_{c_k} P(Y = c_k) \prod_{j} P(X^{(j)} = x^{(j)} \mid Y = c_k) \tag{7}$$

What remains is how to estimate the prior $P(Y = c_k)$ and the conditional probabilities $P(X^{(j)} = x^{(j)} \mid Y = c_k)$ from the training samples, as illustrated in the toy snippet below and developed in Section 3.
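To make Eq. (7) concrete, here is a toy evaluation of the decision rule with hypothetical probabilities (the numbers are illustrative, not from the example below):

```python
import numpy as np

# Hypothetical P(Y=c_k) for two classes
priors = np.array([0.6, 0.4])
# Hypothetical P(X^(j)=x^(j) | Y=c_k): rows are classes, columns are features
conds = np.array([[0.2, 0.1],
                  [0.5, 0.3]])

scores = priors * conds.prod(axis=1)  # Eq. (7), up to the shared denominator
print(np.argmax(scores))              # -> 1, i.e. the second class wins
```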
3. Parameter Estimation for Naive Bayes
3.1 Maximum Likelihood Estimation
In the naive Bayes method, learning means estimating the prior probability $P(Y = c_k)$ and the conditional probabilities $P(X^{(j)} = x^{(j)} \mid Y = c_k)$.

There are many ways to estimate these probabilities, such as maximum likelihood estimation and the multinomial, Gaussian, and Bernoulli models.

Under maximum likelihood estimation, the prior probability is

$$P(Y = c_k) = \frac{\sum_{i=1}^{N} I(y_i = c_k)}{N}, \quad k = 1, 2, \dots, K$$

Suppose the $j$-th feature $x^{(j)}$ can take values in the set $\{a_{j1}, a_{j2}, \dots, a_{jS_j}\}$. Then the maximum likelihood estimate of the conditional probability is

$$P(X^{(j)} = a_{jl} \mid Y = c_k) = \frac{\sum_{i=1}^{N} I(x_i^{(j)} = a_{jl},\, y_i = c_k)}{\sum_{i=1}^{N} I(y_i = c_k)}$$

where $I(\cdot)$ is the indicator function.
Example 1

This example comes from Li Hang's *Statistical Learning Methods*. In the table below, $X^{(1)}$ and $X^{(2)}$ are features and $Y$ is the class label. For the sample $x = (2, 0)^T$, determine its class label $y$.

The data are shown below. Feature $X^{(1)}$ takes values in $\{1, 2, 3\}$ and feature $X^{(2)}$ takes values in $\{0, 1, 2\}$ (the book's $\{S, M, L\}$ encoded as $\{0, 1, 2\}$).
```python
import numpy as np
import pandas as pd

# Training data from Example 1: two features and a binary label
x1 = np.array([1,1,1,1,1,2,2,2,2,2,3,3,3,3,3])
x2 = np.array([0,1,1,0,0,0,1,1,2,2,2,1,1,2,2])
y = np.array([-1,-1,1,1,-1,-1,-1,1,1,1,1,1,1,1,-1])
dataSet = np.concatenate((x1[:,None], x2[:,None], y[:,None]), axis=1)
df = pd.DataFrame(dataSet, index=np.arange(1,16,1), columns=['X1','X2','y'])
df.T
```
| | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| X1 | 1 | 1 | 1 | 1 | 1 | 2 | 2 | 2 | 2 | 2 | 3 | 3 | 3 | 3 | 3 |
| X2 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 1 | 2 | 2 | 2 | 1 | 1 | 2 | 2 |
| y | -1 | -1 | 1 | 1 | -1 | -1 | -1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | -1 |
Solution:

Step 1: estimate the prior probabilities.

$$P(Y = 1) = \frac{9}{15}, \qquad P(Y = -1) = \frac{6}{15}$$

Step 2: estimate the conditional probabilities.

(2.1) Feature $X^{(1)}$:

$$P(X^{(1)} = 1 \mid Y = 1) = \frac{2}{9}, \quad P(X^{(1)} = 2 \mid Y = 1) = \frac{3}{9}, \quad P(X^{(1)} = 3 \mid Y = 1) = \frac{4}{9}$$

$$P(X^{(1)} = 1 \mid Y = -1) = \frac{3}{6}, \quad P(X^{(1)} = 2 \mid Y = -1) = \frac{2}{6}, \quad P(X^{(1)} = 3 \mid Y = -1) = \frac{1}{6}$$

(2.2) Feature $X^{(2)}$:

$$P(X^{(2)} = 0 \mid Y = 1) = \frac{1}{9}, \quad P(X^{(2)} = 1 \mid Y = 1) = \frac{4}{9}, \quad P(X^{(2)} = 2 \mid Y = 1) = \frac{4}{9}$$

$$P(X^{(2)} = 0 \mid Y = -1) = \frac{3}{6}, \quad P(X^{(2)} = 1 \mid Y = -1) = \frac{2}{6}, \quad P(X^{(2)} = 2 \mid Y = -1) = \frac{1}{6}$$

Step 3: compute the (unnormalized) posteriors for $x = (2, 0)^T$.

$$P(Y = 1)\,P(X^{(1)} = 2 \mid Y = 1)\,P(X^{(2)} = 0 \mid Y = 1) = \frac{9}{15} \cdot \frac{3}{9} \cdot \frac{1}{9} = \frac{1}{45}$$

$$P(Y = -1)\,P(X^{(1)} = 2 \mid Y = -1)\,P(X^{(2)} = 0 \mid Y = -1) = \frac{6}{15} \cdot \frac{2}{6} \cdot \frac{3}{6} = \frac{1}{15}$$

Since $\frac{1}{15} > \frac{1}{45}$, the predicted label is $y = -1$.
Below is Python code for maximum-likelihood naive Bayes; its output agrees with the hand computation above.
```python
class MLENB:
    """Maximum likelihood estimation naive Bayes.

    Attributes
    ----------
    class_prior_ : array, shape (n_classes,)
        Empirical probability of each class.
    class_count_ : array, shape (n_classes,)
        Number of training samples observed in each class.
    MLE_ : array, shape (n_classes, n_features)
        Maximum likelihood estimate of each feature per class;
        each element is a dict mapping feature value -> probability.
    """

    def __init__(self):
        pass

    def fit(self, X, y):
        """Fit maximum likelihood estimation naive Bayes according to X, y.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training vectors.
        y : array-like, shape (n_samples,)
            Target values.

        Returns
        -------
        self : object
        """
        n_features = X.shape[1]
        n_classes = len(set(y))
        self.class_count_ = np.empty(n_classes)
        self.class_prior_ = np.empty(n_classes)
        self.MLE_ = np.empty((n_classes, n_features), dtype=dict)
        self.target_unique = np.unique(y)
        for i in range(n_classes):
            # All training samples belonging to the i-th class
            dataX_tu = X[y == self.target_unique[i]]
            self.class_prior_[i] = dataX_tu.shape[0] / float(len(y))
            self.class_count_[i] = dataX_tu.shape[0]
            for j in range(n_features):
                # Empirical frequency of each value of feature j within the class
                feature = dataX_tu[:, j]
                fp = {}
                for f_item in np.unique(feature):
                    fp[f_item] = list(feature).count(f_item) / float(len(feature))
                self.MLE_[i, j] = fp
        return self

    def __predict_likelihood(self, x):
        # Unnormalized posterior P(Y=c_k) * prod_j P(X^(j)=x^(j) | Y=c_k), Eq. (7)
        if x.ndim == 1:
            x = np.array([x])
        n_features = x.shape[1]
        n_classes = len(self.class_count_)
        likelihood = []
        for x_item in x:
            class_p = []
            for i in range(n_classes):
                p = self.class_prior_[i]
                for j in range(n_features):
                    # A feature value never seen in this class gets probability 0
                    p *= self.MLE_[i, j].get(x_item[j], 0.0)
                class_p.append(p)
            likelihood.append(class_p)
        return np.array(likelihood)

    def predict(self, x):
        """Return the predicted class for each test vector in x."""
        likelihood = self.__predict_likelihood(x)
        max_index = np.argmax(likelihood, axis=1)
        return np.array([self.target_unique[i] for i in max_index])

    def predict_proba(self, x):
        """Return the posterior probability of each class for each test
        vector, normalized so that each row sums to 1."""
        likelihood = self.__predict_likelihood(x)
        return np.array([lh / np.sum(lh) for lh in likelihood])
```
```python
# Test: classify the sample x = (2, 0)
X = dataSet[:, 0:-1]
y = dataSet[:, -1]
mlenb = MLENB()
mlenb.fit(X, y)
print(mlenb.predict(np.array([2, 0])))
print(mlenb.predict_proba(np.array([2, 0])))
```

```
[-1]
[[ 0.75  0.25]]
```
3.2 Multinomial Naive Bayes
Maximum likelihood estimation can produce probability estimates that are exactly zero (when a feature value never co-occurs with a class in the training data). A zero factor wipes out the whole product in Eq. (7), distorting the posterior computation and biasing the classification. In that case we can use the multinomial model, which smooths the prior and conditional probabilities. Concretely:

The smoothed prior probability is

$$P_\lambda(Y = c_k) = \frac{\sum_{i=1}^{N} I(y_i = c_k) + \lambda}{N + K\lambda}$$

Suppose the $j$-th feature $x^{(j)}$ takes values in $\{a_{j1}, a_{j2}, \dots, a_{jS_j}\}$. The smoothed conditional probability is

$$P_\lambda(X^{(j)} = a_{jl} \mid Y = c_k) = \frac{\sum_{i=1}^{N} I(x_i^{(j)} = a_{jl},\, y_i = c_k) + \lambda}{\sum_{i=1}^{N} I(y_i = c_k) + S_j\lambda}$$

where $\lambda \ge 0$. Taking $\lambda = 1$ gives Laplace smoothing; $\lambda = 0$ recovers maximum likelihood estimation.
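As a quick check of the smoothing formulas on the data of Example 1 (with $\lambda = 1$, $K = 2$ classes, and $S_j = 3$ values per feature), the computation below reproduces the $y = -1$ factor behind the `[0.65116279 0.34883721]` output shown later:

```python
# Laplace-smoothed estimates for Example 1, lambda = 1
p_neg = (6 + 1) / (15 + 2 * 1)          # P(Y=-1) = 7/17
p_x1_2_neg = (2 + 1) / (6 + 3 * 1)      # P(X1=2 | Y=-1) = 3/9
p_x2_0_neg = (3 + 1) / (6 + 3 * 1)      # P(X2=0 | Y=-1) = 4/9
print(p_neg * p_x1_2_neg * p_x2_0_neg)  # unnormalized posterior for y = -1
```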
A question worth asking: how does multinomial naive Bayes differ from the Bayesian estimation described in Li Hang's *Statistical Learning Methods*? The method in this article follows Li Hang's Bayesian estimation.
Reference Python code for multinomial naive Bayes:
```python
class MultinomialNB:
    """Naive Bayes classifier for multinomial models.

    Attributes
    ----------
    class_prior_ : array, shape (n_classes,)
        Smoothed empirical probability of each class.
    class_count_ : array, shape (n_classes,)
        Number of training samples observed in each class.
    bayes_estimation_ : array, shape (n_classes, n_features)
        Smoothed (Bayesian) estimate of each feature per class;
        each element is a dict mapping feature value -> probability.
    """

    def __init__(self, alpha=1.0):
        # alpha is the smoothing parameter lambda; alpha = 1 is Laplace smoothing
        self.alpha_ = alpha

    def fit(self, X, y):
        n_features = X.shape[1]
        n_classes = len(set(y))
        self.class_count_ = np.empty(n_classes)
        self.class_prior_ = np.empty(n_classes)
        self.bayes_estimation_ = np.empty((n_classes, n_features), dtype=dict)
        self.target_unique = np.unique(y)
        for i in range(n_classes):
            dataX_tu = X[y == self.target_unique[i]]
            # Smoothed prior: (N_k + alpha) / (N + K * alpha)
            self.class_prior_[i] = (dataX_tu.shape[0] + self.alpha_) / (float(len(y)) + n_classes * self.alpha_)
            self.class_count_[i] = dataX_tu.shape[0]
            for j in range(n_features):
                feature = dataX_tu[:, j]
                feature_unique = np.unique(feature)
                fp = {}
                for f_item in feature_unique:
                    # Smoothed conditional: (count + alpha) / (N_k + S_j * alpha),
                    # with S_j taken as the number of values of feature j seen in this class
                    fp[f_item] = (list(feature).count(f_item) + self.alpha_) / (float(len(feature)) + len(feature_unique) * self.alpha_)
                self.bayes_estimation_[i, j] = fp
        return self

    def __predict_likelihood(self, x):
        if x.ndim == 1:
            x = np.array([x])
        n_features = x.shape[1]
        n_classes = len(self.class_count_)
        likelihood = []
        for x_item in x:
            class_p = []
            for i in range(n_classes):
                p = self.class_prior_[i]
                for j in range(n_features):
                    p *= self.bayes_estimation_[i, j].get(x_item[j], 0.0)
                class_p.append(p)
            likelihood.append(class_p)
        return np.array(likelihood)

    def predict(self, x):
        likelihood = self.__predict_likelihood(x)
        max_index = np.argmax(likelihood, axis=1)
        return np.array([self.target_unique[i] for i in max_index])

    def predict_proba(self, x):
        likelihood = self.__predict_likelihood(x)
        return np.array([lh / np.sum(lh) for lh in likelihood])
```
```python
# Test: classify the sample x = (2, 0)
X = dataSet[:, 0:-1]
y = dataSet[:, -1]
mnb = MultinomialNB()
mnb.fit(X, y)
print(mnb.predict(np.array([2, 0])))
print(mnb.predict_proba(np.array([2, 0])))
```

```
[-1]
[[ 0.65116279  0.34883721]]
```
3.3 Gaussian Naive Bayes
When the input features are continuous, the methods above cannot be used to estimate the conditional probabilities; the Gaussian model can be used instead.

The Gaussian model assumes each feature follows a normal distribution within each class. The likelihood of a feature value is then

$$P(x^{(j)} \mid Y = c_k) = \frac{1}{\sqrt{2\pi\sigma_{k,j}^2}} \exp\!\left( -\frac{(x^{(j)} - \mu_{k,j})^2}{2\sigma_{k,j}^2} \right)$$

where $\mu_{k,j}$ and $\sigma_{k,j}^2$ are the mean and variance of feature $j$ over the training samples of class $c_k$.
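As a hand-check of this formula against the class below, the snippet evaluates the class-conditional density of $X^{(1)} = 2$ for class $y = -1$ using `scipy.stats.norm`; scipy is used here only for verification, while the class below computes the same density directly:

```python
from scipy.stats import norm

# X1 values of the six y = -1 samples in Example 1
x1_neg = np.array([1, 1, 1, 2, 2, 3])
mu, var = x1_neg.mean(), x1_neg.var()  # np.var defaults to the population variance
print(norm.pdf(2, loc=mu, scale=np.sqrt(var)))  # ~0.484
```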
The Python code is as follows:
```python
class GaussianNB:
    """Gaussian naive Bayes.

    Attributes
    ----------
    class_prior_ : array, shape (n_classes,)
        Probability of each class.
    class_count_ : array, shape (n_classes,)
        Number of training samples observed in each class.
    theta_ : array, shape (n_classes, n_features)
        Mean of each feature per class.
    sigma_ : array, shape (n_classes, n_features)
        Variance of each feature per class.
    """

    def __init__(self):
        pass

    def fit(self, X, y):
        n_features = X.shape[1]
        n_classes = len(set(y))
        self.theta_ = np.zeros([n_classes, n_features])
        self.sigma_ = np.zeros([n_classes, n_features])
        self.class_prior_ = np.zeros(n_classes)
        self.class_count_ = np.zeros(n_classes)
        self.target_unique = np.unique(y)
        for i in range(n_classes):
            dataX_tu = X[y == self.target_unique[i]]
            self.class_prior_[i] = dataX_tu.shape[0] / float(len(y))
            self.class_count_[i] = dataX_tu.shape[0]
            # Per-class mean and variance of every feature
            self.theta_[i, :] = np.mean(dataX_tu, axis=0)
            self.sigma_[i, :] = np.var(dataX_tu, axis=0)
        return self

    def __predict_likelihood(self, x):
        if x.ndim == 1:
            x = np.array([x])
        likelihood = []
        for x_item in x:
            # Gaussian density of each feature under each class
            gaussian = np.exp(-(x_item - self.theta_) ** 2 / (2 * self.sigma_)) / np.sqrt(2 * np.pi * self.sigma_)
            # Multiply per-feature densities via a sum of logs for numerical stability
            p = np.exp(np.sum(np.log(gaussian), axis=1))
            likelihood.append(self.class_prior_ * p)
        return np.array(likelihood)

    def predict(self, x):
        likelihood = self.__predict_likelihood(x)
        max_index = np.argmax(likelihood, axis=1)
        return np.array([self.target_unique[i] for i in max_index])

    def predict_proba(self, x):
        likelihood = self.__predict_likelihood(x)
        return np.array([lh / np.sum(lh) for lh in likelihood])
```
```python
# Test: classify the sample x = (2, 0)
X = dataSet[:, 0:-1]
y = dataSet[:, -1]
gnb = GaussianNB()
gnb.fit(X, y)
print(gnb.predict(np.array([2, 0])))
print(gnb.predict_proba(np.array([2, 0])))
```

```
[-1]
[[ 0.74566865  0.25433135]]
```
3.4 Bernoulli Naive Bayes
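The Bernoulli model applies when every feature is binary (0/1): within each class $c_k$, feature $j$ is modeled as a Bernoulli variable with success probability $p_{kj}$, so that $P(x^{(j)} \mid Y = c_k) = p_{kj}^{x^{(j)}} (1 - p_{kj})^{1 - x^{(j)}}$. Below is a minimal sketch in the same style as the classes above; the class name `BernoulliNB_` and the Laplace-style smoothing via `alpha` are this sketch's own choices rather than something from the original text, and it is not scikit-learn's `BernoulliNB`:

```python
class BernoulliNB_:
    """Sketch of a Bernoulli naive Bayes for strictly binary (0/1) features."""

    def __init__(self, alpha=1.0):
        self.alpha_ = alpha  # smoothing parameter; alpha = 1 is Laplace smoothing

    def fit(self, X, y):
        self.target_unique = np.unique(y)
        n_classes = len(self.target_unique)
        self.class_prior_ = np.empty(n_classes)
        # feature_prob_[i, j] = smoothed P(X^(j) = 1 | Y = c_i)
        self.feature_prob_ = np.empty((n_classes, X.shape[1]))
        for i in range(n_classes):
            dataX_tu = X[y == self.target_unique[i]]
            self.class_prior_[i] = (dataX_tu.shape[0] + self.alpha_) / (len(y) + n_classes * self.alpha_)
            self.feature_prob_[i, :] = (dataX_tu.sum(axis=0) + self.alpha_) / (dataX_tu.shape[0] + 2 * self.alpha_)
        return self

    def predict(self, x):
        if x.ndim == 1:
            x = np.array([x])
        likelihood = []
        for x_item in x:
            # P(x_j | c) = p^x * (1-p)^(1-x) for each binary feature
            p = self.feature_prob_ ** x_item * (1 - self.feature_prob_) ** (1 - x_item)
            likelihood.append(self.class_prior_ * np.prod(p, axis=1))
        return np.array([self.target_unique[i] for i in np.argmax(likelihood, axis=1)])
```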
5. Naive Bayes Caveats
- The categorical models above work only with categorical predictors; numerical predictors must be discretized or binned before use (see the binning sketch after this list), or handled with the Gaussian model of Section 3.3.
- Naive Bayes relies on the assumption of predictor independence, and thus cannot detect or account for relationships between the predictors, unlike, for example, a decision tree.
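A minimal sketch of binning a continuous predictor before feeding it to a categorical naive Bayes model; the variable names and bin edges here are arbitrary illustrations:

```python
import numpy as np

# Discretize a continuous feature into categorical bins with np.digitize
ages = np.array([3, 17, 25, 40, 62, 80])
edges = np.array([18, 35, 60])      # bins: <18, [18,35), [35,60), >=60
age_cat = np.digitize(ages, edges)  # -> [0 0 1 2 3 3]
```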