Handwritten digit recognition is the "hello world" of neural networks. This article builds it step by step with PyTorch and, through training and tuning, reaches "100%" accuracy.
1. Quick start
1.1 Define the network class, inheriting from torch.nn.Module; the file is named digit_recog.py
import torch.nn as nn


class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Sequential(nn.Conv2d(1, 6, 5, 1, 2),
                                   nn.ReLU(),
                                   nn.MaxPool2d(2, 2))
        self.conv2 = nn.Sequential(nn.Conv2d(6, 16, 5),
                                   nn.ReLU(),
                                   nn.MaxPool2d(2, 2))
        self.fc1 = nn.Sequential(
            nn.Linear(16 * 5 * 5, 120),
            # nn.Dropout2d(),
            nn.ReLU()
        )
        self.fc2 = nn.Sequential(
            nn.Linear(120, 84),
            nn.Dropout2d(),
            nn.ReLU()
        )
        self.fc3 = nn.Linear(84, 10)

    # Forward pass
    def forward(self, x):
        x = self.conv1(x)
        x = self.conv2(x)
        # Linear layers expect flat feature vectors, so flatten the multi-dimensional tensor
        x = x.view(x.size()[0], -1)
        x = self.fc1(x)
        x = self.fc2(x)
        x = self.fc3(x)
        return x
The class above defines the network: two convolutional blocks followed by three fully connected layers. The size of the final layer is fixed by the problem: 10 outputs, one per digit 0-9. A quick way to verify the 16 * 5 * 5 input size of fc1 is sketched below.
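As a sanity check (not part of the original files, just a sketch assuming the digit_recog.py above is importable): for a 28x28 MNIST input, conv1 keeps 28x28 thanks to its padding of 2, the first pooling halves it to 14x14, conv2 (5x5 kernel, no padding) gives 10x10, and the second pooling gives 5x5 over 16 channels, hence 16 * 5 * 5 = 400 inputs to fc1.

import torch
from digit_recog import Net

net = Net()
net.eval()                                # disable Dropout for the check
dummy = torch.zeros(1, 1, 28, 28)         # one fake single-channel 28x28 image
features = net.conv2(net.conv1(dummy))    # output of the convolutional part
print(features.shape)                     # torch.Size([1, 16, 5, 5])
print(net(dummy).shape)                   # torch.Size([1, 10])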

1.2 Start training:
import torch
import torchvision as tv
import torchvision.transforms as transforms
import torch.nn as nn
import torch.optim as optim
import os
import copy
import time
from digit_recog import Net
from digit_recog_mydataset import MyDataset


# Load a previously saved model, if one exists
def getmodel(pth, net):
    state_filepath = pth
    if os.path.exists(state_filepath):
        # Load the saved parameters
        nn_state = torch.load(state_filepath)
        # Load the parameters into the model
        net.load_state_dict(nn_state)
        # Return a copy of the state dict
        return copy.deepcopy(nn_state)
    else:
        return net.state_dict()


# Build the datasets
def getdataset(batch_size):
    # Data preprocessing
    transform = transforms.ToTensor()
    # Training dataset
    trainset = tv.datasets.MNIST(
        root='./data/',
        train=True,
        download=True,
        transform=transform)
    # Uncomment to add your own dataset
    # trainset += MyDataset(os.path.abspath("./data/myimages/"), 'train.txt', transform=transform)
    # Training data loader
    trainloader = torch.utils.data.DataLoader(
        trainset,
        batch_size=batch_size,
        shuffle=True,
    )
    # Test dataset
    testset = tv.datasets.MNIST(
        root='./data/',
        train=False,
        download=True,
        transform=transform)
    # Uncomment to add your own dataset
    # testset += MyDataset(os.path.abspath("./data/myimages/"), 'test.txt', transform=transform)
    # Test data loader
    testloader = torch.utils.data.DataLoader(
        testset,
        batch_size=batch_size,
        shuffle=False,
    )
    return trainloader, testloader


# Training loop
def training(device, net, model, dataset_loader, epochs, criterion, optimizer, save_model_path):
    trainloader, testloader = dataset_loader
    # Best model weights so far
    best_model_wts = model
    # Best score so far
    best_acc = 0.0
    # Start timing
    since = time.time()
    for epoch in range(epochs):
        sum_loss = 0.0
        # Iterate over the training set
        for i, data in enumerate(trainloader):
            inputs, labels = data
            inputs, labels = inputs.to(device), labels.to(device)
            # Zero the gradients so they do not accumulate across iterations
            optimizer.zero_grad()
            # Forward pass
            outputs = net(inputs)
            # Compute the loss
            loss = criterion(outputs, labels)
            # Back-propagate the loss
            loss.backward()
            # Update the parameters
            optimizer.step()
            # Accumulate the loss
            sum_loss += loss.item()
            # Print the average loss every 100 batches
            if i % 100 == 99:
                print('[%d, %d] loss: %.03f'
                      % (epoch + 1, i + 1, sum_loss / 100))
                sum_loss = 0.0
        # Measure accuracy on the test set after each epoch
        with torch.no_grad():
            correct = 0
            total = 0
            for data in testloader:
                images, labels = data
                images, labels = images.to(device), labels.to(device)
                outputs = net(images)
                # Take the class with the highest score
                _, predicted = torch.max(outputs.data, 1)
                # print(labels)
                # print(torch.nn.Softmax(dim=1)(outputs.data).detach().numpy()[0])
                # print(torch.nn.functional.normalize(outputs.data).detach().numpy()[0])
                total += labels.size(0)
                correct += (predicted == labels).sum()
            print('Test result: {}/{}'.format(correct, total))
            epoch_acc = correct.double() / total
            print('Current score: {}  Best score: {}'.format(epoch_acc, best_acc))
            if epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(net.state_dict())
            print('Recognition accuracy after epoch %d: %d%%' % (epoch + 1, (100 * correct / total)))
    time_elapsed = time.time() - since
    print('Training completed in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best score: {:4f}'.format(best_acc))
    # Save the best model weights
    if save_model_path is not None:
        save_state_path = save_model_path
        torch.save(best_model_wts, save_state_path)


# Use the GPU if available, otherwise the CPU
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
NET = Net().to(DEVICE)
# Hyperparameters
EPOCHS = 8  # number of training epochs
BATCH_SIZE = 64  # batch size
LR = 0.001  # learning rate
# Cross-entropy loss, commonly used for multi-class classification
CRITERION = nn.CrossEntropyLoss()
# Optimizer
# OPTIMIZER = optim.SGD(NET.parameters(), lr=LR, momentum=0.9)
OPTIMIZER = optim.Adam(NET.parameters(), lr=LR)
MODEL = getmodel(os.path.join('model/', 'net.pth'), NET)
training(DEVICE, NET, MODEL, getdataset(BATCH_SIZE), EPOCHS, CRITERION, OPTIMIZER, os.path.join('model/', 'net.pth'))
With the standard MNIST dataset, the recognition accuracy reaches about 99%.
2. Get involved
The goal is to recognize images you drew yourself, which makes the exercise more hands-on.
2.1 Open the Paint tool from the Windows Accessories, draw a few digits with the mouse, then save them with a screenshot tool.

2.2 Implement your own dataset:
digit_recog_mydataset.py
from PIL import Image
import torch
import os


# Custom dataset implementation
class MyDataset(torch.utils.data.Dataset):
    def __init__(self, root, datafile, transform=None, target_transform=None):
        super(MyDataset, self).__init__()
        datas = []
        with open(os.path.join(root, datafile), 'r') as fh:
            for line in fh:
                # Strip the trailing newline from the line
                line = line.rstrip()
                # Split the line on whitespace (spaces, tabs, newlines by default)
                words = line.split()
                # words[0] is the image file name, words[1] is the label
                datas.append((words[0], int(words[1])))
        self.datas = datas
        self.transform = transform
        self.target_transform = target_transform
        self.root = root

    # Required method: return one sample by index
    def __getitem__(self, index):
        # Fetch the image file name and label stored above (words[0] and words[1])
        img, label = self.datas[index]
        # Open the image, resize it to 28x28 and convert it to grayscale
        img = Image.open(os.path.join(self.root, img)).resize((28, 28)).convert('L')
        # Apply the preprocessing transform
        if self.transform is not None:
            img = self.transform(img)
        return img, label

    # Required method: return the length of the dataset
    def __len__(self):
        return len(self.datas)
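To check the class in isolation before wiring it into training, a small sketch like this can be used (the folder and train.txt below are the names assumed later in this article; the batch size is arbitrary):

import os
import torch
import torchvision.transforms as transforms
from digit_recog_mydataset import MyDataset

# Load the custom samples listed in train.txt and inspect one batch
transform = transforms.ToTensor()
myset = MyDataset(os.path.abspath("./data/myimages/"), 'train.txt', transform=transform)
loader = torch.utils.data.DataLoader(myset, batch_size=4, shuffle=True)
images, labels = next(iter(loader))
print(images.shape, labels)  # e.g. torch.Size([4, 1, 28, 28]) and the 4 labels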
2.3 Create two files in the image folder, train.txt and test.txt, listing the training and test samples respectively; the line format is shown in the example after the note below.

Keep the training and test data strictly separate, otherwise the trained model will give misleading results.
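Based on how MyDataset parses each line (an image file name, then whitespace, then an integer label), the entries in train.txt or test.txt look like this (the file names here are just placeholders):

digit_0.png 0
digit_1.png 1
digit_7.png 7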
2.4 Add the training and test datasets
Uncomment these two lines in the dataset-building function:
# trainset += MyDataset(os.path.abspath("./data/myimages/"), 'train.txt', transform=transform)
# testset += MyDataset(os.path.abspath("./data/myimages/"), 'test.txt', transform=transform)
Run the training again; the best recognition accuracy I got at this point was 98%.
2.5 Test the model
# -*- coding: utf-8 -*-
import torch
import numpy as np
from PIL import Image
import os
import matplotlib
import matplotlib.pyplot as plt
import glob
from digit_recog import Net

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
net = Net().to(device)
# Load the saved parameters
nn_state = torch.load(os.path.join('model/', 'net.pth'))
# Load the parameters into the model
net.load_state_dict(nn_state)
# Switch to evaluation mode so Dropout is disabled during inference
net.eval()
# Set the default font for figure titles
matplotlib.rcParams['font.sans-serif'] = ['SimHei']
matplotlib.rcParams['font.family'] = 'sans-serif'
# Keep the minus sign from rendering as a box
matplotlib.rcParams['axes.unicode_minus'] = False
# Images to recognize
file_list = glob.glob(os.path.join('data/test_image/', '*'))
grid_rows = len(file_list) // 5 + 1
for i, file in enumerate(file_list):
    # Read the image and resize it
    image = Image.open(file).resize((28, 28))
    # Convert to grayscale
    gray_image = image.convert('L')
    # Convert the image to a tensor, scaled to [0, 1] to match the ToTensor preprocessing used in training
    im_data = np.array(gray_image)
    im_data = torch.from_numpy(im_data).float() / 255.0
    im_data = im_data.view(1, 1, 28, 28).to(device)
    # Forward pass
    outputs = net(im_data)
    # Take the class with the highest score
    _, pred = torch.max(outputs, 1)
    # print(torch.nn.Softmax(dim=1)(outputs).detach().numpy()[0])
    # print(torch.nn.functional.normalize(outputs).detach().numpy()[0])
    # Show the image with its prediction
    plt.subplot(grid_rows, 5, i + 1)
    plt.imshow(gray_image)
    plt.title(u"Is this a {}?".format(pred.item()), fontsize=8)
    plt.axis('off')
    print('[{}] predicted digit: [{}]'.format(file, pred.item()))
plt.show()
Visualized results

These images were recognized after data augmentation; the accuracy still leaves room for improvement.
3. Optimization
3.1 More samples:
Collecting them is the hard part.
3.2 Data augmentation:
Apply some simple processing to your own handwritten digit images.
# -*- coding: utf-8 -*-
import os
import glob
from PIL import Image
from scipy.ndimage import filters


class ImageProcceed:
    def __init__(self, image_folder):
        self.image_folder = image_folder

    def save(self, rotate, filter=None, to_gray=True):
        file_list = glob.glob(os.path.join(self.image_folder, '*.png'))
        print(len(file_list))
        for i, file in enumerate(file_list):
            # Read the image
            image = Image.open(file)  # .resize((28, 28))
            # Convert to grayscale
            if to_gray:
                image = image.convert('L')
            # Rotate by the given angle
            image = image.rotate(rotate)
            # Optional Gaussian blur with the given sigma
            if filter is not None:
                image = filters.gaussian_filter(image, filter)
                image = Image.fromarray(image)
            filename = os.path.basename(file)
            fileext = os.path.splitext(filename)[1]
            savefile = filename.replace(fileext, '-rt{}{}'.format(rotate, fileext))
            print(savefile)
            image.save(os.path.join(self.image_folder, savefile))


ip = ImageProcceed('data/myimages/')
ip.save(20, filter=0.5)
3.3 Change the network size:
For example, reduce the three fully connected layers in the Net class above to two; see the sketch below.
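A minimal sketch of that idea, keeping the convolutional part and dropping one fully connected layer (the SmallerNet name and its 84-unit hidden layer are illustrative assumptions, not values from the article):

import torch.nn as nn


class SmallerNet(nn.Module):
    def __init__(self):
        super(SmallerNet, self).__init__()
        # Convolutional blocks unchanged from Net
        self.conv1 = nn.Sequential(nn.Conv2d(1, 6, 5, 1, 2), nn.ReLU(), nn.MaxPool2d(2, 2))
        self.conv2 = nn.Sequential(nn.Conv2d(6, 16, 5), nn.ReLU(), nn.MaxPool2d(2, 2))
        # Only two fully connected layers instead of three
        self.fc1 = nn.Sequential(nn.Linear(16 * 5 * 5, 84), nn.ReLU())
        self.fc2 = nn.Linear(84, 10)

    def forward(self, x):
        x = self.conv1(x)
        x = self.conv2(x)
        x = x.view(x.size()[0], -1)
        x = self.fc1(x)
        return self.fc2(x)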
3.4 Hyperparameter tuning:
Change the learning rate, train for more epochs, and so on; a small sketch of a learning-rate sweep follows.
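One way to try different values without editing constants by hand is a small sweep. This is only a sketch that assumes it is appended to the training script above, so that Net, DEVICE, BATCH_SIZE, EPOCHS, getdataset and training are already defined; the candidate learning rates are arbitrary:

import torch.nn as nn
import torch.optim as optim

# Re-train with each candidate learning rate and compare the printed scores
for lr in (0.01, 0.001, 0.0005):
    print('--- learning rate {} ---'.format(lr))
    net = Net().to(DEVICE)
    optimizer = optim.Adam(net.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()
    # Pass save_model_path=None so the sweep does not overwrite the saved model
    training(DEVICE, net, net.state_dict(), getdataset(BATCH_SIZE), EPOCHS, criterion, optimizer, None)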
Later I adjusted two things in the Net class and the accuracy finally reached 100%. That is only its performance on my small test set; real-world prediction will never reach 100%, and results may differ between machines and between runs. Here is the code again:
import torch.nn as nn


class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        # Convolution: 1 input channel, 6 output channels, 5x5 kernel, stride 1, padding 2
        # ReLU is the usual activation; LeakyReLU/PReLU are later refinements of it
        # MaxPool2d pooling, usually with a window of 2
        self.conv1 = nn.Sequential(nn.Conv2d(1, 6, 5, 1, 2),
                                   nn.PReLU(),
                                   nn.MaxPool2d(2, 2))
        self.conv2 = nn.Sequential(nn.Conv2d(6, 16, 5),
                                   nn.PReLU(),
                                   nn.MaxPool2d(2, 2))
        self.fc1 = nn.Sequential(
            nn.Linear(16 * 5 * 5, 120),  # 16 output channels times the 5x5 feature map
            # nn.Dropout2d(),  # Dropout takes input from Linear layers, Dropout2d from Conv2d layers
            nn.PReLU()
        )
        self.fc2 = nn.Sequential(
            nn.Linear(120, 84),
            nn.Dropout(p=0.2),
            nn.PReLU()
        )
        self.fc3 = nn.Linear(84, 10)  # 10 output nodes, one for each digit 0-9

    # Forward pass
    def forward(self, x):
        x = self.conv1(x)
        x = self.conv2(x)
        # Linear layers expect flat feature vectors, so flatten the multi-dimensional tensor
        x = x.view(x.size()[0], -1)
        x = self.fc1(x)
        x = self.fc2(x)
        x = self.fc3(x)
        return x
Two things changed above: the ReLU activations became PReLU, and the regularization uses plain Dropout with p=0.2. Below is the result of running the test again.

