P(y|X)=P(y)*P(X|y)/P(X)
樣本中的屬性相互獨立;
原問題的等價問題為(分母P(X)對所有類別都相同,可以略去):$\hat{y}=\arg\max_{y}P(y)\prod_{i}P(x_i\mid y)$
數據處理
為防止P(y)*P(X|y)的值下溢,對原問題取對數,即:$\hat{y}=\arg\max_{y}\left[\log P(y)+\sum_{i}\log P(x_i\mid y)\right]$
注意:若某屬性值在訓練集中沒有與某個類同時出現過,則直接統計得到的P(y)或P(X|y)可能為0,這樣計算出的P(y)*P(X|y)的值為0,沒有可比性,且不便於求對數,因此需要對概率值進行“平滑”處理,常用拉普拉斯修正。
先驗概率修正:令Dy表示訓練集D中第y類樣本組成的集合,N表示訓練集D中可能的類別數,則 $P(y)=\frac{|D_y|+1}{|D|+N}$
即每個類別的樣本個數都加 1。
類條件概率修正:令Dy,xi表示Dy中在第 i 個屬性上取值為xi的樣本組成的集合,Ni表示第 i 個屬性可能的取值數,則 $P(x_i\mid y)=\frac{|D_{y,x_i}|+1}{|D_y|+N_i}$
即該類別中第 i 個屬性的每個取值都增加一個樣本。
--------------------------------------------------------------
數據預處理
訓練模型
測試樣本
函數調用
參考
python朴素貝葉斯分類MNIST數據集
import struct
import time

import numpy as np
from numpy import *  # noqa: F401,F403 - kept because other code may rely on it


def read_image(file_name):
    """Read an IDX3 image file (MNIST format) into a 2-D float array.

    The file starts with four big-endian unsigned 32-bit integers (magic
    number, image count, row count, column count) followed by one unsigned
    byte per pixel.

    Args:
        file_name: Path of the IDX3 image file.

    Returns:
        A writable float64 array of shape (image count, rows * cols), one
        flattened image per row.
    """
    # The context manager closes the handle even if reading fails.
    with open(file_name, "rb") as file_handle:
        file_content = file_handle.read()

    header_fmt = '>IIII'
    _magic, imgNum, rows, cols = struct.unpack_from(header_fmt, file_content, 0)
    offset = struct.calcsize(header_fmt)

    # Size each row from the header instead of assuming 28 * 28 = 784 pixels.
    image_size = rows * cols
    pixels = np.frombuffer(
        file_content, dtype=np.uint8, count=imgNum * image_size, offset=offset
    )
    # frombuffer returns a read-only view of the bytes object; astype makes a
    # writable float copy so callers can binarize the data in place.
    return pixels.reshape(imgNum, image_size).astype(np.float64)


def read_label(file_name):
    """Read an IDX1 label file (MNIST format) into a 1-D integer array.

    The file starts with two big-endian unsigned 32-bit integers (magic
    number, label count) followed by one unsigned byte per label.

    Args:
        file_name: Path of the IDX1 label file.

    Returns:
        A 1-D integer array holding one label per sample.
    """
    with open(file_name, "rb") as file_handle:
        file_content = file_handle.read()

    header_fmt = '>II'
    _magic, labelNum = struct.unpack_from(header_fmt, file_content, 0)
    offset = struct.calcsize(header_fmt)

    labels = np.frombuffer(
        file_content, dtype=np.uint8, count=labelNum, offset=offset
    )
    # Widen from uint8 to the default integer type, which is what building
    # the array from a tuple of Python ints produced before.
    return labels.astype(np.int_)


def loadDataSet():
    """Load the MNIST train/test files from the working directory.

    Images are binarized with normalize() before being returned.

    Returns:
        Tuple (train_x, test_x, train_y, test_y).
    """
    # MNIST file names. For Fashion-MNIST use the same names with a
    # "fashion-" prefix (e.g. "fashion-train-images-idx3-ubyte").
    train_x_filename = "train-images-idx3-ubyte"
    train_y_filename = "train-labels-idx1-ubyte"
    test_x_filename = "t10k-images-idx3-ubyte"
    test_y_filename = "t10k-labels-idx1-ubyte"

    train_x = read_image(train_x_filename)  # one flattened image per row
    train_y = read_label(train_y_filename)  # one label per training image
    test_x = read_image(test_x_filename)
    test_y = read_label(test_y_filename)

    train_x = normalize(train_x)
    test_x = normalize(test_x)

    # Tip for debugging: slice the arrays here (e.g. train_x[0:1000, :],
    # train_y[0:1000]) to work on a smaller data set.
    return train_x, test_x, train_y, test_y
def normalize(data):
    """Binarize pixel values in place: every non-zero entry becomes 1.

    Args:
        data: numpy array of pixel values (one image per row).

    Returns:
        The same array object, now holding only 0 and 1.
    """
    # Vectorized replacement for the per-pixel Python loop. Assigning through
    # the ellipsis keeps the in-place behaviour of the original.
    data[...] = (data != 0)
    return data


def train_model(train_x, train_y, classNum):
    """Count class frequencies and per-pixel value frequencies.

    Args:
        train_x: 2-D array of binarized images (values 0 or 1), one per row.
            Entries that are neither 0 nor 1 are not counted.
        train_y: Integer class label of each row, each in [0, classNum).
        classNum: Number of classes.

    Returns:
        Tuple (prior_probability, conditional_probability):
        prior_probability[c] is the number of training samples of class c
        (a count, not a normalized probability);
        conditional_probability[c][j][v] is the relative frequency of value
        v at pixel j within class c, rescaled to the range [1, 10001].

    Raises:
        IndexError: If a label lies outside [0, classNum).
    """
    samples = np.asarray(train_x)
    labels = np.asarray(train_y).astype(int)
    n = samples.shape[1]
    if labels.size and (labels.min() < 0 or labels.max() >= classNum):
        raise IndexError("train_y contains a label outside [0, classNum)")

    prior_probability = np.zeros(classNum)
    conditional_probability = np.zeros((classNum, n, 2))
    for c in range(classNum):
        rows = samples[labels == c].astype(int)
        prior_probability[c] = rows.shape[0]
        conditional_probability[c, :, 0] = (rows == 0).sum(axis=0)
        conditional_probability[c, :, 1] = (rows == 1).sum(axis=0)

    # Rescale the relative frequencies to [1, 10001]. The "+ 1" keeps every
    # factor non-zero, so one unseen pixel value cannot zero a whole product.
    totals = conditional_probability.sum(axis=2, keepdims=True)
    has_data = totals > 0
    safe_totals = np.where(has_data, totals, 1.0)
    # A class with no training samples has no evidence either way; use a
    # uniform 0.5 instead of dividing by zero.
    ratios = np.where(has_data, conditional_probability / safe_totals, 0.5)
    conditional_probability = ratios * 10000 + 1
    return prior_probability, conditional_probability


def cal_probability(img, label, prior_probability, conditional_probability):
    """Return the unnormalized score of `label` for one binarized image.

    The score is int(prior_probability[label]) multiplied by the
    integer-truncated conditional_probability entry selected by each pixel
    value. Python integers have arbitrary precision, so the product of
    hundreds of factors neither overflows nor underflows.

    Args:
        img: 1-D array of binarized pixel values.
        label: Class index to score.
        prior_probability: Per-class values as returned by train_model.
        conditional_probability: Table as returned by train_model.

    Returns:
        The score as a Python int.
    """
    pixels = np.asarray(img).astype(int)
    table = np.asarray(conditional_probability)[label]
    # Select, for each pixel j, the entry table[j][pixels[j]] in one step.
    # tolist() yields Python ints so the product below cannot overflow.
    factors = table[np.arange(pixels.shape[0]), pixels].astype(np.int64).tolist()
    probability = int(prior_probability[label])
    for factor in factors:
        probability *= factor
    return probability


def predict(test_x, test_y, prior_probability, conditional_probability):
    """Predict the class of every row of test_x (argmax of the score).

    Args:
        test_x: 2-D array of binarized images, one per row.
        test_y: Unused; kept so existing callers keep working.
        prior_probability: Per-class values as returned by train_model.
        conditional_probability: Table as returned by train_model.

    Returns:
        1-D array with the predicted label of each row. On ties the lowest
        label wins.
    """
    # Derive the number of classes from the model instead of assuming 10.
    class_count = len(prior_probability)
    predict_y = []
    for img in np.asarray(test_x):
        max_label = 0
        max_probability = cal_probability(
            img, 0, prior_probability, conditional_probability
        )
        # Label 0 is the initial candidate, so start comparing at 1.
        for j in range(1, class_count):
            probability = cal_probability(
                img, j, prior_probability, conditional_probability
            )
            if max_probability < probability:
                max_probability = probability
                max_label = j
        predict_y.append(max_label)
    return np.array(predict_y)


def cal_accuracy(test_y, predict_y):
    """Return the fraction of predictions that match the true labels.

    Args:
        test_y: Array of true labels.
        predict_y: Array of predicted labels, at least as long as test_y.

    Returns:
        Accuracy as a float in [0, 1]; 0.0 when test_y is empty.

    Raises:
        IndexError: If predict_y has fewer entries than test_y.
    """
    actual = np.asarray(test_y)
    m = actual.shape[0]
    if m == 0:
        # Nothing to evaluate; avoid dividing by zero.
        return 0.0
    predicted = np.asarray(predict_y)[:m]
    if predicted.shape[0] != m:
        raise IndexError("predict_y has fewer entries than test_y")
    errorCount = float(np.count_nonzero(actual != predicted))
    return 1.0 - errorCount / m


def main():
    """Train on MNIST, predict the test set and print timings and accuracy."""
    classNum = 10
    print("Start reading data...")
    time1 = time.time()
    # loadDataSet() already returns binarized images, so they are not passed
    # through normalize() a second time here.
    train_x, test_x, train_y, test_y = loadDataSet()
    time2 = time.time()
    print("read data cost", time2 - time1, "second")

    print("start training data...")
    prior_probability, conditional_probability = train_model(
        train_x, train_y, classNum
    )
    for i in range(classNum):
        print(prior_probability[i])  # number of training samples per label
    time3 = time.time()
    print("train data cost", time3 - time2, "second")

    print("start predicting data...")
    predict_y = predict(test_x, test_y, prior_probability, conditional_probability)
    time4 = time.time()
    print("predict data cost", time4 - time3, "second")

    print("start calculate accuracy...")
    acc = cal_accuracy(test_y, predict_y)
    time5 = time.time()
    print("accuarcy", acc)
    print("calculate accuarcy cost", time5 - time4, "second")


if __name__ == '__main__':
    main()