識別手寫數字增強版100% - pytorch從入門到入道(一)


手寫數字識別,神經網絡領域的“hello world”例子,通過pytorch一步步構建,通過訓練與調整,達到“100%”准確率

1、快速開始

1.1 定義神經網絡類,繼承torch.nn.Module,文件名為digit_recog.py

 1 import torch.nn as nn
 2 
 3 
 4 class Net(nn.Module):
 5     def __init__(self):
 6         super(Net, self).__init__()
 7         self.conv1 = nn.Sequential(nn.Conv2d(1, 6, 5, 1, 2)
 8                                    , nn.ReLU()
 9                                    , nn.MaxPool2d(2, 2))
10         self.conv2 = nn.Sequential(nn.Conv2d(6, 16, 5)
11                                    , nn.ReLU()
12                                    , nn.MaxPool2d(2, 2))
13         self.fc1 = nn.Sequential(
14             nn.Linear(16 * 5 * 5, 120),
15 # nn.Dropout2d(),
16 nn.ReLU()
17         )
18         self.fc2 = nn.Sequential(
19             nn.Linear(120, 84),
20 nn.Dropout2d(),
21 nn.ReLU()
22         )
23         self.fc3 = nn.Linear(84, 10)
24 
25     # 前向傳播
26 def forward(self, x):
27         x = self.conv1(x)
28         x = self.conv2(x)
29         # 線性層的輸入輸出都是一維數據,所以要把多維度的tensor展平成一維
30 x = x.view(x.size()[0], -1)
31         x = self.fc1(x)
32         x = self.fc2(x)
33         x = self.fc3(x)
34         return x
 
        

上面的類定義了一個帶兩個卷積塊、3層全連接的網絡結構,最后一層的輸出節點數由問題類型決定:這裏是0-9共10類數字,因此輸出固定為10個節點

1.2 開始訓練:

import torch
import torchvision as tv
import torchvision.transforms as transforms
import torch.nn as nn
import torch.optim as optim
import os
import copy
import time
from digit_recog import Net
from digit_recog_mydataset import MyDataset


# 讀取已保存的模型
# Load previously saved model weights, if a checkpoint file exists.
def getmodel(pth, net):
    """Load weights from *pth* into *net* when the file exists.

    Returns an independent copy of the loaded state dict, or the
    net's current state dict when no saved checkpoint is found.
    """
    if not os.path.exists(pth):
        return net.state_dict()
    # Load the saved parameters and apply them to the model.
    saved_state = torch.load(pth)
    net.load_state_dict(saved_state)
    # Hand back a deep copy so callers can't mutate the loaded state.
    return copy.deepcopy(saved_state)


# 構建數據集
# Build the MNIST train/test data loaders.
def getdataset(batch_size):
    """Return a (train_loader, test_loader) pair batched at *batch_size*."""
    # Preprocessing: convert PIL images to [0, 1] float tensors.
    transform = transforms.ToTensor()

    # Training set (downloaded under ./data/ on first use).
    trainset = tv.datasets.MNIST(
        root='./data/',
        train=True,
        download=True,
        transform=transform)

    # Uncomment to mix your own samples into the training set:
    # trainset += MyDataset(os.path.abspath("./data/myimages/"), 'train.txt', transform=transform)

    # Shuffled, batched iterator over the training data.
    trainloader = torch.utils.data.DataLoader(
        trainset,
        batch_size=batch_size,
        shuffle=True,
    )

    # Test set.
    testset = tv.datasets.MNIST(
        root='./data/',
        train=False,
        download=True,
        transform=transform)

    # Uncomment to mix your own samples into the test set:
    # testset += MyDataset(os.path.abspath("./data/myimages/"), 'test.txt', transform=transform)

    # Batched iterator over the test data; evaluation needs no shuffling.
    testloader = torch.utils.data.DataLoader(
        testset,
        batch_size=batch_size,
        shuffle=False,
    )

    return trainloader, testloader


# 訓練
# Train the network and keep the best-scoring weights.
def training(device, net, model, dataset_loader, epochs, criterion, optimizer, save_model_path):
    """Train *net* for *epochs* epochs, scoring on the test set after each.

    device: torch.device to run on.
    net: the model to train (updated in place).
    model: initial state dict, used as the fallback "best" weights.
    dataset_loader: (train_loader, test_loader) pair.
    criterion / optimizer: loss function and optimizer.
    save_model_path: path for saving the best state dict, or None to skip saving.
    """
    trainloader, testloader = dataset_loader
    # Best weights seen so far and their test accuracy.
    best_model_wts = model
    best_acc = 0.0
    # Wall-clock timer for the whole run.
    since = time.time()
    for epoch in range(epochs):
        sum_loss = 0.0
        # Fix: make sure Dropout is active while training (evaluation
        # below switches the net to eval mode).
        net.train()
        for i, data in enumerate(trainloader):
            inputs, labels = data
            inputs, labels = inputs.to(device), labels.to(device)
            # Clear gradients so they don't accumulate across batches.
            optimizer.zero_grad()
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            # Backpropagate and apply the optimizer step.
            loss.backward()
            optimizer.step()
            sum_loss += loss.item()
            # Report the average loss every 100 batches.
            if i % 100 == 99:
                print('[%d, %d] loss: %.03f'
                      % (epoch + 1, i + 1, sum_loss / 100))
                sum_loss = 0.0
        # Evaluate accuracy on the test set after each epoch.
        # Fix: eval mode disables Dropout so scoring is deterministic.
        net.eval()
        with torch.no_grad():
            correct = 0
            total = 0
            for data in testloader:
                images, labels = data
                images, labels = images.to(device), labels.to(device)
                outputs = net(images)
                # Take the class with the highest score.
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum()

            print('測試結果:{}/{}'.format(correct, total))
            epoch_acc = correct.double() / total
            print('當前分數:{} 最高分數:{}'.format(epoch_acc, best_acc))
            if epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(net.state_dict())
            print('第%d輪的識別准確率為:%d%%' % (epoch + 1, (100 * correct / total)))

    time_elapsed = time.time() - since
    print('訓練完成於 {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('最高分數: {:4f}'.format(best_acc))
    # Save the best weights.
    # Fix: honour the save_model_path argument instead of a hard-coded path.
    if save_model_path is not None:
        torch.save(best_model_wts, save_model_path)


# Run on GPU when available, otherwise fall back to CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
NET = Net().to(DEVICE)
# Hyper-parameters
EPOCHS = 8  # number of training epochs
# NOTE(review): EPOCHS is never used — the training() call below passes the
# literal 1 instead; confirm which epoch count is intended.
BATCH_SIZE = 64  # mini-batch size
LR = 0.001  # learning rate

# Cross-entropy loss, the usual choice for multi-class classification.
CRITERION = nn.CrossEntropyLoss()
# Optimizer (SGD alternative kept for reference).
# OPTIMIZER = optim.SGD(net.parameters(), lr=LR, momentum=0.9)
OPTIMIZER = optim.Adam(NET.parameters(), lr=LR)
MODEL = getmodel(os.path.join('model/', 'net.pth'), NET)
training(DEVICE, NET, MODEL, getdataset(BATCH_SIZE), 1, CRITERION, OPTIMIZER, os.path.join('model/', 'net.pth'))

利用標准的mnist數據集跑出來的識別率能達到99%

2、參與進來

目的是為了識別自己的圖片,增加參與感

2.1 打開windows附件中的畫圖工具,用鼠標畫幾個數字,然后用截圖工具保存下來

2.2 實現自己的數據集:

digit_recog_mydataset.py

from PIL import Image
import torch
import os


# 實現自己的數據集
# Custom dataset: image paths and labels listed in a plain-text file.
class MyDataset(torch.utils.data.Dataset):
    """Dataset described by *datafile* under *root*.

    Each line of the file is "<image-path> <label>" separated by
    whitespace; labels are integers.
    """

    def __init__(self, root, datafile, transform=None, target_transform=None):
        super(MyDataset, self).__init__()
        datas = []
        # Fix: open the listing file with a context manager so the handle
        # is always closed, even when a line fails to parse.
        with open(os.path.join(root, datafile), 'r') as fh:
            for line in fh:
                # Strip the trailing newline, then split on any whitespace:
                # words[0] is the image path, words[1] the integer label.
                words = line.rstrip().split()
                # Robustness: tolerate blank lines in the listing file.
                if not words:
                    continue
                datas.append((words[0], int(words[1])))

        self.datas = datas
        self.transform = transform
        self.target_transform = target_transform
        self.root = root

    # Required: fetch one (image, label) pair by index.
    def __getitem__(self, index):
        img, label = self.datas[index]
        # Load the image, resize to 28x28 and convert to grayscale
        # to match the MNIST input format.
        img = Image.open(os.path.join(self.root, img)).resize((28, 28)).convert('L')

        # Apply the preprocessing pipeline, if any.
        if self.transform is not None:
            img = self.transform(img)
        return img, label

    # Required: size of the dataset.
    def __len__(self):
        return len(self.datas)

2.3 在圖片文件夾中新建兩個文件,train.txt和test.txt,分別寫上訓練與測試集的數據,格式如下

 

訓練與測試的數據要嚴格區分開,否則訓練出來的模型會有問題

2.4 加入訓練、測試數據集

反注釋訓練方法中的這兩行

# trainset += MyDataset(os.path.abspath("./data/myimages/"), 'train.txt', transform=transform)

# testset += MyDataset(os.path.abspath("./data/myimages/"), 'test.txt', transform=transform)

繼續執行訓練,這里我訓練出來的最高識別率是98%

2.5 測試模型

# -*- coding: utf-8 -*-
# Inference script: load the trained weights and predict every image
# under data/test_image/, displaying the results in a matplotlib grid.

import torch
import numpy as np
from PIL import Image
import os
import matplotlib
import matplotlib.pyplot as plt
import glob
from digit_recog import Net

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
net = Net().to(device)
# Load the saved parameters.
nn_state = torch.load(os.path.join('model/', 'net.pth'))
# Apply them to the model.
net.load_state_dict(nn_state)
# Fix: the model contains Dropout, so it must be switched to eval mode
# for deterministic inference.
net.eval()

# Use a CJK-capable default font for the plot titles.
matplotlib.rcParams['font.sans-serif'] = ['SimHei']
matplotlib.rcParams['font.family'] = 'sans-serif'
# Render the minus sign correctly with the CJK font.
matplotlib.rcParams['axes.unicode_minus'] = False

# Images to classify.
file_list = glob.glob(os.path.join('data/test_image/', '*'))
# Fix: subplot() needs an integer row count — use floor division.
grid_rows = len(file_list) // 5 + 1

with torch.no_grad():
    for i, file in enumerate(file_list):
        # Read the image, resize to the 28x28 network input, grayscale it.
        gray_image = Image.open(file).resize((28, 28)).convert('L')
        # Fix: scale pixel values to [0, 1] to match the ToTensor()
        # preprocessing used during training (raw 0-255 input skews logits).
        im_data = torch.from_numpy(np.array(gray_image)).float() / 255.0
        # Fix: move the input to the same device as the model.
        im_data = im_data.view(1, 1, 28, 28).to(device)
        # Forward pass.
        outputs = net(im_data)
        # Highest-scoring class is the prediction.
        _, pred = torch.max(outputs, 1)
        # Show the image with its predicted digit as the title.
        plt.subplot(grid_rows, 5, i + 1)
        plt.imshow(gray_image)
        plt.title(u"你是{}?".format(pred.item()), fontsize=8)
        plt.axis('off')

        print('[{}]預測數字為: [{}]'.format(file, pred.item()))

plt.show()

可視化結果

 

 這批圖片是經過圖片增強后識別的結果,准確率有待提高

3、優化

3.1 更多樣本:

收集難度大

3.2 數據增強:

簡單地處理一下自己手寫的數字圖片

# -*- coding: utf-8 -*-
# encoding:utf-8

import torch
import numpy as np
from PIL import Image
import os
import matplotlib
import matplotlib.pyplot as plt
import glob
from scipy.ndimage import filters

class ImageProcceed:
    """Simple data augmentation: rotate (and optionally blur) saved digit images."""

    def __init__(self, image_folder):
        # Folder containing the .png images to augment.
        self.image_folder = image_folder

    def save(self, rotate, filter=None, to_gray=True):
        """Write a rotated copy of every .png in the folder.

        rotate: rotation angle in degrees; also embedded in the new filename.
        filter: if not None, apply a Gaussian blur after rotating.
            Fix/generalization: the value is now used as the blur sigma —
            the original ignored it and always used sigma 0.5, which the
            usual call filter=0.5 happened to match, so behavior for the
            existing caller is unchanged.
        to_gray: convert to grayscale first (default True).
        """
        file_list = glob.glob(os.path.join(self.image_folder, '*.png'))
        print(len(file_list))
        for file in file_list:
            image = Image.open(file)  # .resize((28, 28))
            # Grayscale to match the MNIST input format.
            if to_gray:
                image = image.convert('L')
            # Rotate by the requested angle.
            image = image.rotate(rotate)
            if filter is not None:
                blurred = filters.gaussian_filter(image, filter)
                image = Image.fromarray(blurred)
            # Save alongside the original as <name>-rt<angle><ext>.
            filename = os.path.basename(file)
            fileext = os.path.splitext(filename)[1]
            savefile = filename.replace(fileext, '-rt{}{}'.format(rotate, fileext))
            print(savefile)
            image.save(os.path.join(self.image_folder, savefile))


# Rotate every image under data/myimages/ by 20 degrees,
# then apply a sigma-0.5 Gaussian blur before saving the copies.
ip = ImageProcceed('data/myimages/')
ip.save(20, filter=0.5)

3.3 改變網絡大小:

比如把上面的Net類中的3層改為2層

3.4 調參:

改變學習率,訓練更多次數等

 

后面我調整了Net類中的兩個地方,准確率終於達到100%,這只是在我小批量測試集上的表現而已,而現實中預測是不可能達到100%的,每台機器可能有差異,每次運行的結果會有不同,再次帖出代碼

 1 import torch.nn as nn
 2 
 3 
 4 class Net(nn.Module):
 5     def __init__(self):
 6         super(Net, self).__init__()
 7         # 卷積: 1通道輸入,6通道輸出,卷積核5*5,步長1,前后補2個0
 8         # 激活函數一般用ReLU,后面改良的有LeakyReLU/PReLU
 9         # MaxPool2d池化,一般是2
10         self.conv1 = nn.Sequential(nn.Conv2d(1, 6, 5, 1, 2)
11                                    , nn.PReLU()
12                                    , nn.MaxPool2d(2, 2))
13         self.conv2 = nn.Sequential(nn.Conv2d(6, 16, 5)
14                                    , nn.PReLU()
15                                    , nn.MaxPool2d(2, 2))
16         self.fc1 = nn.Sequential(
17             nn.Linear(16 * 5 * 5, 120),  # 卷積輸出16,乘以卷積核5*5
18             # nn.Dropout2d(),  # Dropout接收來自linear的數據,Dropout2d接收來自conv2d的數據
19             nn.PReLU()
20         )
21         self.fc2 = nn.Sequential(
22             nn.Linear(120, 84),
23             nn.Dropout(p=0.2),
24             nn.PReLU()
25         )
26         self.fc3 = nn.Linear(84, 10)  # 輸出層節點為10,代表數字0-9
27 
28     # 前向傳播
29     def forward(self, x):
30         x = self.conv1(x)
31         x = self.conv2(x)
32         # 線性層的輸入輸出都是一維數據,所以要把多維度的tensor展平成一維
33         x = x.view(x.size()[0], -1)
34         x = self.fc1(x)
35         x = self.fc2(x)
36         x = self.fc3(x)
37         return x

上面改了兩個地方:一是激活函數由ReLU改成了PReLU,二是正則化Dropout的丟棄概率設為0.2,下面是再次運行測試后的結果

 

 

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM