python朴素貝葉斯分類MNIST數據集


實現代碼:
import struct
from numpy import *
import numpy as np
import time
def read_image(file_name):
    """Load an IDX3 image file (the MNIST image format) into a 2-D array.

    The file begins with four big-endian unsigned 32-bit integers. The
    first one is ignored here (it is the magic number in the IDX format);
    the next three are the image count, the row count and the column
    count. One unsigned byte per pixel follows.

    Args:
        file_name: Path of the IDX3 file to read.

    Returns:
        A float64 array of shape (image_count, rows * cols); each row is
        one flattened image holding the raw byte values.

    Raises:
        struct.error: If the file is too short to contain the header.
        ValueError: If the file holds fewer pixel bytes than the header
            announces.
    """
    # The context manager guarantees the handle is closed even on error.
    with open(file_name, "rb") as file_handle:
        file_content = file_handle.read()

    header_format = '>IIII'
    _, img_num, rows, cols = struct.unpack_from(header_format, file_content, 0)
    offset = struct.calcsize(header_format)

    # Size each row from the header instead of assuming 28 * 28 = 784.
    image_size = rows * cols

    # Decode every pixel in one pass rather than one struct call per image.
    pixels = np.frombuffer(
        file_content,
        dtype=np.uint8,
        count=img_num * image_size,
        offset=offset,
    )
    # astype() copies, so the result is writable and independent of the
    # read-only bytes buffer.
    return pixels.reshape(img_num, image_size).astype(np.float64)

#讀取標簽
def read_label(file_name):
    """Load an IDX1 label file (the MNIST label format) into a 1-D array.

    The file begins with two big-endian unsigned 32-bit integers. The
    first one is ignored here (it is the magic number in the IDX format);
    the second is the number of labels. One unsigned byte per label
    follows.

    Args:
        file_name: Path of the IDX1 file to read.

    Returns:
        A 1-D integer array with one label per entry.

    Raises:
        struct.error: If the file is too short to contain the header.
        ValueError: If the file holds fewer label bytes than the header
            announces.
    """
    # The context manager guarantees the handle is closed even on error.
    with open(file_name, "rb") as file_handle:
        file_content = file_handle.read()

    header_format = '>II'
    _, label_num = struct.unpack_from(header_format, file_content, 0)
    offset = struct.calcsize(header_format)

    labels = np.frombuffer(
        file_content, dtype=np.uint8, count=label_num, offset=offset
    )
    # Widen to the platform integer so the labels behave like plain ints
    # (usable as indices, no uint8 wrap-around in arithmetic).
    return labels.astype(int)

def loadDataSet():
    """Read the four MNIST files from the working directory.

    Images are binarized with normalize() before being returned; labels
    are returned as read.

    Returns:
        A tuple (train_x, test_x, train_y, test_y).
    """
    # Files are read in this order: train images, train labels,
    # test images, test labels.
    # For Fashion-MNIST, prefix each file name with "fashion-".
    sources = (
        (read_image, "train-images-idx3-ubyte"),
        (read_label, "train-labels-idx1-ubyte"),
        (read_image, "t10k-images-idx3-ubyte"),
        (read_label, "t10k-labels-idx1-ubyte"),
    )
    train_x, train_y, test_x, test_y = [
        reader(file_name) for reader, file_name in sources
    ]

    # Tip for debugging: slice the four arrays here (e.g. the first 1000
    # training rows and first 500 test rows) to shorten a run.
    return normalize(train_x), normalize(test_x), train_y, test_y

def normalize(data):
    """Binarize pixel values in place.

    Every non-zero entry becomes 1 and every zero entry stays 0.

    Args:
        data: NumPy array of pixel values; it is modified in place.

    Returns:
        The same array object that was passed in, now holding only 0/1.
    """
    # One vectorized assignment replaces the element-by-element Python
    # loop; writing through data[...] keeps the update in place so callers
    # holding a reference to the array see the binarized values.
    data[...] = data != 0
    return data

#(1)計算先驗概率及條件概率
def train_model(train_x, train_y, classNum):
    """Count class frequencies and per-pixel value frequencies.

    Args:
        train_x: 2-D array of shape (samples, features) whose entries are
            already binarized to 0/1.
        train_y: Integer class label for each row of train_x.
        classNum: Number of classes; labels must lie in [0, classNum).

    Returns:
        A tuple (prior_probability, conditional_probability):

        * prior_probability has shape (classNum,) and holds the number of
          training samples of each class (a count, not a ratio).
        * conditional_probability has shape (classNum, features, 2).
          Entry [c, j, v] is the fraction of class-c samples whose pixel j
          equals v, scaled to the range [1, 10001]. A class with no
          samples gets 1 everywhere.

    Raises:
        ValueError: If a label is outside [0, classNum) or a pixel value
            is not 0 or 1.
        IndexError: If train_y has fewer entries than train_x has rows.
    """
    m = train_x.shape[0]
    n = train_x.shape[1]
    labels = np.asarray(train_y)[:m].astype(int)
    if labels.size and (labels.min() < 0 or labels.max() >= classNum):
        raise ValueError("labels must lie in [0, classNum)")

    prior_probability = np.zeros(classNum)
    conditional_probability = np.zeros((classNum, n, 2))

    # Count one class at a time so only that class's rows are copied.
    for label in range(classNum):
        class_pixels = train_x[labels == label].astype(int)
        sample_count = class_pixels.shape[0]
        zeros = np.count_nonzero(class_pixels == 0, axis=0)
        ones = np.count_nonzero(class_pixels == 1, axis=0)
        if np.any(zeros + ones != sample_count):
            raise ValueError("train_x must contain only 0 and 1")

        prior_probability[label] = sample_count
        conditional_probability[label, :, 0] = zeros
        conditional_probability[label, :, 1] = ones

    # Scale each fraction into [1, 10001]. The +1 keeps every factor
    # non-zero so a pixel value never seen for a class cannot zero out a
    # product built from these numbers.
    totals = conditional_probability.sum(axis=2, keepdims=True)
    # A class without samples has total 0; dividing by 1 instead leaves
    # its fractions at 0 and avoids a division by zero.
    totals[totals == 0] = 1
    conditional_probability = conditional_probability / totals * 10000 + 1
    return prior_probability, conditional_probability

#(2)對給定的x,計算先驗概率和條件概率的乘積
def cal_probability(img, label, prior_probability, conditional_probability):
    """Score one image against one class.

    The score is the integer part of the class's prior entry multiplied by
    the integer part of the conditional entry selected by each pixel value.
    Python integers are used throughout, so the product cannot overflow.

    Args:
        img: 1-D array of binarized pixel values (0 or 1).
        label: Class index to score.
        prior_probability: Per-class values indexed by label.
        conditional_probability: Array indexed as [label][pixel][value].

    Returns:
        The product as a Python int.
    """
    probability = int(prior_probability[label])
    class_table = conditional_probability[label]
    # Pixel values arrive as floats; convert once so they can be indices.
    pixel_values = img.astype(int)
    for feature_index in range(img.shape[0]):
        factor = class_table[feature_index][pixel_values[feature_index]]
        probability *= int(factor)
    return probability

#確定實例x的類,相當於argmax
def predict(test_x, test_y, prior_probability, conditional_probability):
    """Pick, for every image, the class with the highest score.

    Args:
        test_x: 2-D array of binarized images, one image per row.
        test_y: Unused; kept so existing callers need no change.
        prior_probability: Per-class values; its length determines how
            many classes are scored.
        conditional_probability: Array indexed as [label][pixel][value].

    Returns:
        A NumPy array with the predicted class index for each row of
        test_x. On ties the lowest class index wins.
    """
    # Derive the class count from the model instead of assuming 10.
    class_count = len(prior_probability)
    predict_y = []
    for row in test_x:
        img = np.asarray(row)
        scores = [
            cal_probability(img, label, prior_probability, conditional_probability)
            for label in range(class_count)
        ]
        # list.index returns the first occurrence of the maximum, which is
        # the same winner a strict "<" comparison loop would keep.
        predict_y.append(scores.index(max(scores)))
    return np.array(predict_y)

def cal_accuracy(test_y, predict_y):
    """Return the fraction of predictions that match the true labels.

    Args:
        test_y: True labels, one per sample.
        predict_y: Predicted labels; only the first len(test_y) entries
            are compared.

    Returns:
        Accuracy as a float in [0.0, 1.0].

    Raises:
        ZeroDivisionError: If test_y is empty.
        IndexError: If predict_y has fewer entries than test_y.
    """
    actual = np.asarray(test_y).ravel()
    predicted = np.asarray(predict_y).ravel()
    m = actual.shape[0]
    if m == 0:
        raise ZeroDivisionError("cannot compute accuracy for an empty label set")
    if predicted.shape[0] < m:
        raise IndexError("predict_y has fewer entries than test_y")

    # Count mismatches with one vectorized comparison.
    error_count = np.count_nonzero(actual != predicted[:m])
    return 1.0 - float(error_count) / m

if __name__=='__main__':
    # Script entry point: load data, train, predict, report accuracy, and
    # print how long each stage took.
    classNum = 10

    print("Start reading data...")
    time1 = time.time()
    # loadDataSet() already returns binarized images, so normalize() is
    # not called a second time here.
    train_x, test_x, train_y, test_y = loadDataSet()
    time2 = time.time()
    print("read data cost",time2-time1,"second")

    print("start training data...")
    prior_probability, conditional_probability = train_model(train_x, train_y, classNum)
    # Show how many training samples each class has.
    for i in range(classNum):
        print(prior_probability[i])
    time3 = time.time()
    print("train data cost",time3-time2,"second")

    print("start predicting data...")
    predict_y = predict(test_x, test_y, prior_probability, conditional_probability)
    time4 = time.time()
    print("predict data cost",time4-time3,"second")

    print("start calculate accuracy...")
    acc = cal_accuracy(test_y, predict_y)
    time5 = time.time()
    print("accuarcy",acc)
    print("calculate accuarcy cost",time5-time4,"second")
結果截圖:輸出中的 5923.0 等數字,是我打印出來的每個類別的圖片數量。

調用自己寫的朴素貝葉斯函數正確率是84.12%,調用sklearn中的BernoulliNB函數,正確率是84.27%

調用sklearn中的BernoulliNB函數的代碼如下:

結果截屏:

 

優化:加入主成分分析方法,進行降維操作,代碼如下:

結果截屏:

待修改中!

參考鏈接:https://blog.csdn.net/wds2006sdo/article/details/51967839


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM