使用Keras訓練大規模數據集

本文轉載自查看原文 2018-07-26 12:26 5374 deep learning/ keras

Keras 官方提供的 ImageDataGenerator.flow_from_directory(directory) 方法可以分批讀取大規模訓練數據並用於訓練，基本可以滿足大部分需求。但是在有些場合下，需要自己讀取大規模數據以及對應標籤，下面提供一種方法。

步驟0：導入相關模塊

import random
import numpy as np
from keras.preprocessing.image import load_img,img_to_array
from keras.preprocessing.image import ImageDataGenerator
from keras.models import Model

步驟1：準備數據

# Training-set sample paths: three cat images followed by three dog images
train_X = ["train/%s_%d.jpg" % (animal, index)
           for animal in ("cat", "dog")
           for index in (1, 2, 3)]
# Validation-set sample paths, laid out the same way under val/
val_X = ["val/%s_%d.jpg" % (animal, index)
         for animal in ("cat", "dog")
         for index in (1, 2, 3)]

# Derive the class label of each image from its file path
def get_img_label(img_paths):
    """Return one integer label per path in ``img_paths``.

    The label is taken from the file name: the text before the first
    underscore of the last ``/``-separated path component. ``cat`` maps to
    0; every other prefix maps to 1.
    """
    return [
        0 if path.split("/")[-1].split('_')[0] == 'cat' else 1
        for path in img_paths
    ]

# Load a single image from disk as an array
def load_batch_image(img_path, train_set=True, target_size=(224, 224)):
    """Load the image at ``img_path`` and convert it to a numpy array.

    Args:
        img_path: Path of the image file, passed to ``load_img``.
        train_set: When true, the array is returned as produced by
            ``img_to_array``; when false, it is divided by 255.0 first.
        target_size: Forwarded to ``load_img`` as ``target_size``.

    Returns:
        The image array, unscaled for ``train_set`` and divided by 255.0
        otherwise.
    """
    pixels = img_to_array(load_img(img_path, target_size=target_size))
    return pixels if train_set else pixels / 255.0
# Build a single-pass iterator over shuffled batches
def GET_DATASET_SHUFFLE(X_samples, batch_size, train_set = True):
    """Yield shuffled ``(x, y)`` batches for one pass over ``X_samples``.

    Args:
        X_samples: Sequence of image paths. It is copied before shuffling,
            so the caller's sequence keeps its order.
        batch_size: Number of samples per batch. Only full batches are
            produced; samples left over after the last full batch are
            dropped for this pass.
        train_set: Forwarded to ``load_batch_image`` for every image.

    Yields:
        Tuples ``(x, y)`` where ``x`` is an array stacking the loaded images
        of one batch and ``y`` is the array of their integer labels from
        ``get_img_label``. Nothing is yielded when there are fewer than
        ``batch_size`` samples.
    """
    # Shuffle a private copy: shuffling the argument in place would silently
    # reorder the caller's list on every call.
    shuffled = list(X_samples)
    random.shuffle(shuffled)

    batch_num = len(shuffled) // batch_size
    if batch_num < 1:
        # Not enough samples for one full batch. Splitting into zero sections
        # would raise inside numpy, so end the pass with no batches instead.
        return

    paths = shuffled[:batch_num * batch_size]
    labels = get_img_label(paths)
    # Reports how many samples survive truncation, in the form "(n,)".
    print(np.shape(paths))

    for start in range(0, len(paths), batch_size):
        stop = start + batch_size
        x = np.array([load_batch_image(path, train_set)
                      for path in paths[start:stop]])
        y = np.array(labels[start:stop])
        yield x, y

步驟2：對訓練數據進行數據增強處理

# Options for the training-time augmentation pipeline. Rescaling is done here
# because the training loader is called with train_set=True, which returns
# arrays without dividing by 255.
augmentation_options = {
    "rescale": 1. / 255,
    "rotation_range": 10,
    "width_shift_range": 0.2,
    "height_shift_range": 0.2,
    "shear_range": 0.2,
    "zoom_range": 0.2,
    "horizontal_flip": True,
}
train_datagen = ImageDataGenerator(**augmentation_options)

步驟3：定義模型

# Placeholder from the article: replace the ellipsis with a real model
# definition. The training loop in this file adds the result of
# train_on_batch to a two-element total reported as loss and accuracy, so
# the model presumably needs to be compiled with one accuracy metric --
# confirm against your own model.
model = Model(...)

步驟4：模型訓練

n_epoch = 12
batch_size = 16
log_every = 200  # print running averages after every 200 training batches
val_steps = int(len(val_X) / batch_size)  # full validation batches per pass

for epoch in range(n_epoch):
    print("epoch", epoch)
    batch_num = 0
    loss_sum = np.zeros(2)  # running [loss, accuracy] totals since the last report
    # Each item is one batch of batch_size images loaded from disk.
    for X_train, y_train in GET_DATASET_SHUFFLE(train_X, batch_size, True):
        # flow() keeps producing augmented batches indefinitely; take exactly
        # one augmented version of this batch and move on.
        augmented = train_datagen.flow(X_train, y_train, batch_size=batch_size)
        X_batch, y_batch = next(augmented)
        loss_sum += model.train_on_batch(X_batch, y_batch)
        batch_num += 1
        if batch_num % log_every == 0:
            print("epoch %s, batch %s: train_loss = %.4f, train_acc = %.4f"
                  % (epoch, batch_num, loss_sum[0] / log_every, loss_sum[1] / log_every))
            loss_sum = np.zeros(2)
    # Validate once per epoch on unaugmented batches (train_set=False).
    res = model.evaluate_generator(
        GET_DATASET_SHUFFLE(val_X, batch_size, False), val_steps)
    print("val_loss = %.4f, val_acc = %.4f: " % (res[0], res[1]))

    # Saved at the end of every epoch, overwriting the previous file.
    model.save("weight.h5")

另外，如果在訓練的時候不需要做數據增強處理，那麼訓練就更加簡單了，如下：

def _repeat_batches(samples, batch_size, train_set=True):
    """Yield batches forever by restarting GET_DATASET_SHUFFLE after each pass.

    Raises:
        ValueError: If a pass produces no batch at all, which would otherwise
            make this generator spin without ever yielding.
    """
    while True:
        produced = False
        for batch in GET_DATASET_SHUFFLE(samples, batch_size, train_set):
            produced = True
            yield batch
        if not produced:
            raise ValueError(
                "at least batch_size samples are required to form a batch")


# fit_generator draws steps_per_epoch batches per epoch from ONE generator for
# all epochs. GET_DATASET_SHUFFLE ends after a single pass, so it is wrapped to
# restart (and reshuffle) at every epoch boundary.
# NOTE(review): with train_set=True the images are not divided by 255, while
# the validation loader (train_set=False) is; confirm the model expects
# unscaled pixel values on this no-augmentation path.
model.fit_generator(
  _repeat_batches(train_X, batch_size, True),
  epochs=10,
  steps_per_epoch=len(train_X) // batch_size)

參考文獻：

Training on Large Scale Image Datasets with Keras

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯繫本站郵箱yoyou2525@163.com刪除。

猜您在找 大規模數據爬取 -- Python C++ 大規模數據排序(100G數據使用 4G 內存排序) 大規模數據如何實現數據的高效追溯？爬蟲大規模數據采集心得和示例大規模數據從SQL SERVER導入到ORACLE方法 hbase大規模數據寫入的優化歷程 Flink在大規模狀態數據集下的checkpoint調優訓練自己的數據集如何使用yolov3訓練自己的數據集 python大規模數據處理技巧之一：數據常用操作