Machine Learning: A Naive Bayes Classifier for Binary Classification (Bernoulli Model), with Code and a Hands-On Project


1. Building the Naive Bayes Classifier
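The class below implements the standard Bernoulli naive Bayes decision rule. For an instance x with binary features x^(j), it predicts the class c_k that maximizes the log of the unnormalized posterior:

    log P(y=c_k) + sum_j [ x^(j) * log P(x^(j)=1|y=c_k) + (1 - x^(j)) * log P(x^(j)=0|y=c_k) ]

Working in log space avoids numerical underflow when many small probabilities are multiplied, and both the priors and the conditionals are estimated from counts with Laplace smoothing (coefficient alpha), so feature values never seen during training do not receive zero probability.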

import numpy as np

class BernoulliNaiveBayes:

    def __init__(self, alpha=1.):
        # Smoothing coefficient; the default of 1 gives Laplace smoothing.
        self.alpha = alpha

    def _class_prior_proba_log(self, y, classes):
        '''Compute the log of the prior probabilities P(y=c_k) for all classes.'''

        # Count the number of training instances in each class.
        c_count = np.count_nonzero(y == classes[:, None], axis=1)
        # Class priors with Laplace smoothing.
        p = (c_count + self.alpha) / (len(y) + len(classes) * self.alpha)

        return np.log(p)

    def _conditional_proba_log(self, X, y, classes):
        '''Compute the log of all conditional probabilities P(x^(j)|y=c_k).'''

        _, n = X.shape
        K = len(classes)

        # P_log holds two matrices of log conditional probabilities:
        # P_log[0] stores all log(P(x^(j)=0|y=c_k)),
        # P_log[1] stores all log(P(x^(j)=1|y=c_k)).
        P_log = np.empty((2, K, n))

        # Iterate over each class c_k.
        for k, c in enumerate(classes):
            # Select the instances belonging to class c_k.
            X_c = X[y == c]
            # For each feature, count the instances where its value is 1.
            count1 = np.count_nonzero(X_c, axis=0)
            # Conditional probability P(x^(j)=1|y=c_k) with Laplace smoothing.
            p1 = (count1 + self.alpha) / (len(X_c) + 2 * self.alpha)
            # Store log(P(x^(j)=0|y=c_k)) and log(P(x^(j)=1|y=c_k)).
            P_log[0, k] = np.log(1 - p1)
            P_log[1, k] = np.log(p1)

        return P_log

    def train(self, X_train, y_train):
        '''Train the model.'''

        # Collect all class labels.
        self.classes = np.unique(y_train)
        # Compute and store the log prior probabilities.
        self.pp_log = self._class_prior_proba_log(y_train, self.classes)
        # Compute and store the log conditional probabilities.
        self.cp_log = self._conditional_proba_log(X_train, y_train, self.classes)

    def _predict(self, x):
        '''Predict the class of a single instance.'''

        K = len(self.classes)
        p_log = np.empty(K)

        # Indices of the features whose value is 1 and 0, respectively.
        idx1 = x == 1
        idx0 = ~idx1

        # Iterate over each class c_k.
        for k in range(K):
            # Log of the numerator of the posterior P(c_k|x).
            p_log[k] = self.pp_log[k] + np.sum(self.cp_log[0, k][idx0]) \
                                      + np.sum(self.cp_log[1, k][idx1])

        # Return the class label with the largest posterior
        # (np.argmax alone would return the index, not the label).
        return self.classes[np.argmax(p_log)]

    def predict(self, X):
        '''Predict the classes of the instances in X.'''

        # Apply _predict to every row of X and collect the results.
        return np.apply_along_axis(self._predict, axis=1, arr=X)
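As a quick sanity check, here is a minimal sketch on a tiny invented dataset (X_toy and y_toy are made up for illustration; feature 0 tracks the label, feature 1 is noise):

# Tiny invented binary dataset.
X_toy = np.array([[1, 0], [1, 1], [0, 0], [0, 1], [1, 0], [0, 1]])
y_toy = np.array([1, 1, 0, 0, 1, 0])

clf = BernoulliNaiveBayes()
clf.train(X_toy, y_toy)
print(clf.predict(np.array([[1, 0], [0, 1]])))  # expected output: [1 0]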

2. Obtaining the Dataset

The project uses the UCI Spambase dataset (4,601 e-mail instances, 57 features plus a spam/non-spam label), available at:

http://archive.ics.uci.edu/ml/machine-learning-databases/spambase/
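To fetch the file from a script instead of a browser, a minimal sketch (this assumes the UCI mirror above is reachable; the local file name is your choice):

import urllib.request

url = ('http://archive.ics.uci.edu/ml/machine-learning-databases/'
       'spambase/spambase.data')
# Save next to the script; adjust the destination path as needed.
urllib.request.urlretrieve(url, 'spambase.data')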

3. Loading and Transforming the Data

import numpy as np

# Load the raw CSV data (adjust the path to your environment).
data = np.loadtxt('F:/python_test/data/spambase.data', delimiter=',')
print(data)

# Keep only the first 48 columns (the word-frequency features)
# and binarize them: 1 if the word occurs at all, 0 otherwise.
X = data[:, :48]
X = np.where(X > 0, 1, 0)
print(X)

# The last column is the label (1 = spam, 0 = non-spam).
y = data[:, -1].astype('int')
print(y)
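Before fitting a model, a one-line check of the class balance can be useful (np.bincount counts label occurrences; for the full Spambase data this should show roughly 2788 non-spam and 1813 spam instances, per the UCI documentation):

# Instances per class: index 0 = non-spam, index 1 = spam.
print(np.bincount(y))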

4. Model Fitting, Prediction, and Accuracy

Single training run

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Hold out 30% of the data for testing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

clf = BernoulliNaiveBayes()
clf.train(X_train, y_train)

y_pred = clf.predict(X_test)
print(y_pred)
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)
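As a cross-check (not part of the original project), scikit-learn's own Bernoulli naive Bayes should reach a very similar accuracy on the same split:

from sklearn.naive_bayes import BernoulliNB

# Reference implementation with the same Laplace smoothing (alpha=1).
sk_clf = BernoulliNB(alpha=1.0)
sk_clf.fit(X_train, y_train)
print(sk_clf.score(X_test, y_test))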

Averaged over many runs, the accuracy changes very little even as the test fraction grows (i.e., as the training set shrinks), which suggests that the naive Bayes classifier needs only a small number of samples to learn most of the feature statistics:

def test(X, y, test_size, N):
    '''Average accuracy over N random train/test splits.'''
    acc = np.empty(N)
    for i in range(N):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
        clf = BernoulliNaiveBayes()
        clf.train(X_train, y_train)
        y_pred = clf.predict(X_test)
        acc[i] = accuracy_score(y_test, y_pred)
    return np.mean(acc)

# Sweep the test-set fraction from 0.3 to 0.9.
sizes = np.arange(0.3, 1, 0.1)
print(sizes)
acc = [test(X, y, test_size, 100) for test_size in sizes]
print(acc)

import matplotlib.pyplot as plt

plt.plot(sizes, acc, linestyle='--', color='red')
plt.ylim([0.87, 0.88])
plt.xlabel('test_size / (test_size + train_size)')
plt.ylabel('accuracy')
plt.title('Accuracy vs. test-set fraction')
plt.show()