Reference: https://blog.csdn.net/u012222949/article/details/72875281
Reference: https://blog.csdn.net/chengshuhao1991/article/details/78656724
Reference: https://zhuanlan.zhihu.com/p/27238630
Storing data as TFRecord files:
Converting other data into a TFRecord file takes two steps:
Create a TFRecord writer
Build an Example protocol block for each sample
1. Building the TFRecord writer
The function that creates the writer is:
tf.python_io.TFRecordWriter(path)  # writes a TFRecord file; path is where the TFRecord file will be stored
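A minimal, self-contained sketch of the writer's lifecycle, assuming TensorFlow 1.x; the path is a placeholder and the Example is a trivial one (step 2 below covers how Examples are actually built):

import tensorflow as tf

writer = tf.python_io.TFRecordWriter('/tmp/demo.tfrecords')   # placeholder output path
# A trivial Example with a single int64 feature, just to exercise the writer
example = tf.train.Example(features=tf.train.Features(feature={
    'label': tf.train.Feature(int64_list=tf.train.Int64List(value=[1]))}))
writer.write(example.SerializeToString())
writer.close()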
2. Building the Example protocol block for each sample
The Example protocol buffer is defined as follows:
message Example {
    Features features = 1;
};
message Features {
    map<string, Feature> feature = 1;
};
message Feature {
    oneof kind {
        BytesList bytes_list = 1;
        FloatList float_list = 2;
        Int64List int64_list = 3;
    }
};
Some helper code that builds such a block is shown below:
def _int64_feature(value):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

tf.train.Example(features=tf.train.Features(feature={'i': _int64_feature(1), 'j': _int64_feature(2)}))
# Or, written out directly:
tf.train.Example(features=tf.train.Features(feature={
    'i': tf.train.Feature(int64_list=tf.train.Int64List(value=[1])),
    'j': tf.train.Feature(int64_list=tf.train.Int64List(value=[2]))}))
# The result is:
features {
  feature {
    key: "i"
    value { int64_list { value: 1 } }
  }
  feature {
    key: "j"
    value { int64_list { value: 2 } }
  }
}
tf.train.Example(features=None)
# Used when writing TFRecord files
# features: a tf.train.Features instance
# Returns an Example protocol block

tf.train.Features(feature=None)
# Builds the key-value pairs describing one sample
# feature: a dict whose keys are the names to store and whose values are tf.train.Feature instances
# Returns a Features instance

tf.train.Feature(**options)
# options accepts one of three data formats:
#   bytes_list = tf.train.BytesList(value=[Bytes])
#   int64_list = tf.train.Int64List(value=[Value])
#   float_list = tf.train.FloatList(value=[Value])
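A small sketch that wraps one value of each of the three formats, using the same helper-function style as the examples below; the feature names and values here are arbitrary:

import tensorflow as tf

def _bytes_feature(value):
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

def _int64_feature(value):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

def _float_feature(value):
    return tf.train.Feature(float_list=tf.train.FloatList(value=[value]))

example = tf.train.Example(features=tf.train.Features(feature={
    'name':  _bytes_feature(b'cat.jpg'),   # arbitrary bytes value
    'label': _int64_feature(1),            # arbitrary int value
    'score': _float_feature(0.5)}))        # arbitrary float value
print(example)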
writer = tf.python_io.TFRecordWriter(filename)
example = tf.train.Example(features=tf.train.Features(feature={'i': _int64_feature(i), 'j': _int64_feature(j)}))
writer.write(example.SerializeToString())  # serialize the Example to a string

# The reading code below is the counterpart of the writing code above
filename_queue = tf.train.string_input_producer(files, shuffle=False)  # pass in a list of filenames; TensorFlow turns it into a filename queue
reader = tf.TFRecordReader()
_, serialized = reader.read(filename_queue)
features = tf.parse_single_example(serialized, features={
    'i': tf.FixedLenFeature([], tf.int64),
    'j': tf.FixedLenFeature([], tf.int64)})
# tf.parse_single_example parses an Example protocol block into tensors
i, j = features['i'], features['j']
Finally, here is how image data is converted into a TFRecord; each sample is handled as follows:

example = tf.train.Example(features=tf.train.Features(feature={
    "image": tf.train.Feature(bytes_list=tf.train.BytesList(value=[image])),   # image: the raw image data as bytes
    "label": tf.train.Feature(int64_list=tf.train.Int64List(value=[label]))})) # label: the label as an int
Example 1: converting image files into a TFRecord file (full code):
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import numpy as np
import tensorflow as tf
import pandas as pd

def get_label_from_filename(filename):
    return 1

filenames = tf.train.match_filenames_once('C:/Users/1/Desktop/3/*.jpg')
writer = tf.python_io.TFRecordWriter('C:/Users/1/Desktop/png_train.tfrecords')

with tf.Session() as sess:
    # match_filenames_once creates a local variable, so tf.local_variables_initializer() is needed to initialize it
    sess.run([tf.global_variables_initializer(), tf.local_variables_initializer()])
    filenames = sess.run(filenames)
    print(filenames)
    # The strings obtained here are bytes strings (prefixed with b), analogous to unicode strings prefixed with u.
    # bytes -> unicode is str.decode('utf-8') and unicode -> bytes is str.encode('utf-8'), hence the decode below.
    for filename in filenames:
        img = mpimg.imread(filename.decode('utf-8'))
        print("{} shape is {}".format(filename, img.shape))
        image_raw = img.tostring()
        label = get_label_from_filename(filename)
        example = tf.train.Example(
            features=tf.train.Features(
                feature={
                    "image_raw": tf.train.Feature(bytes_list=tf.train.BytesList(value=[image_raw])),
                    "label": tf.train.Feature(int64_list=tf.train.Int64List(value=[label]))
                }
            )
        )
        writer.write(record=example.SerializeToString())  # serialize the Example to a string
    writer.close()
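A sketch of reading these records back, along the lines of Example 4 below; the decode dtype is an assumption (mpimg.imread typically yields uint8 arrays for JPG files), and the original height/width/channels would be needed to reshape the flat buffer:

import tensorflow as tf

filename_queue = tf.train.string_input_producer(['C:/Users/1/Desktop/png_train.tfrecords'])
reader = tf.TFRecordReader()
_, serialized = reader.read(filename_queue)
parsed = tf.parse_single_example(serialized, features={
    'image_raw': tf.FixedLenFeature([], tf.string),
    'label': tf.FixedLenFeature([], tf.int64)})
image = tf.decode_raw(parsed['image_raw'], tf.uint8)   # assumed dtype: uint8, as written by Example 1
label = tf.cast(parsed['label'], tf.int32)

with tf.Session() as sess:
    sess.run([tf.global_variables_initializer(), tf.local_variables_initializer()])
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    img_val, label_val = sess.run([image, label])
    print(img_val.shape, label_val)   # img_val is still flat; reshaping needs the original dimensions
    coord.request_stop()
    coord.join(threads)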
The glob module:
It returns a list of all file paths that match a pattern.

import glob
glob.glob("/home/zikong/doc/*.doc")
# Returns something like:
# /home/zikong/doc/file1.doc  /home/zikong/doc/file2.doc
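Labels can often be derived from the matched paths themselves; a sketch assuming a hypothetical directory-per-class layout (the paths and class names here are made up):

import glob
import os

# Hypothetical layout: images live under /data/cats/*.jpg and /data/dogs/*.jpg
addrs = glob.glob('/data/*/*.jpg')
# Map each parent directory name to an integer label
label_map = {'cats': 0, 'dogs': 1}
labels = [label_map[os.path.basename(os.path.dirname(a))] for a in addrs]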
Example 2: generating a TFRecord file:
from random import shuffle
import numpy as np
import glob
import tensorflow as tf
import cv2
import sys
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
shuffle_data = True
image_path = '/path/to/image/*.jpg'
# Get the paths of all images under this directory; type(addrs) = list
addrs = glob.glob(image_path)
# How the labels are obtained depends on the dataset; type(labels) = list
labels = ...
# Shuffle the data
if shuffle_data:
    c = list(zip(addrs, labels))  # pair up the two lists
    shuffle(c)                    # shuffle with random.shuffle
    addrs, labels = zip(*c)       # unzip the pairs again
# Split the dataset as needed
train_addrs = addrs[0:int(0.7 * len(addrs))]
train_labels = labels[0:int(0.7 * len(labels))]
val_addrs = addrs[int(0.7 * len(addrs)):int(0.9 * len(addrs))]
val_labels = labels[int(0.7 * len(labels)):int(0.9 * len(labels))]
test_addrs = addrs[int(0.9 * len(addrs)):]
test_labels = labels[int(0.9 * len(labels)):]

# Load an image from one of the addresses obtained above
def load_image(addr):
    img = cv2.imread(addr)
    img = cv2.resize(img, (224, 224), interpolation=cv2.INTER_CUBIC)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    # Divide by 255 to normalize the pixel values into [0, 1]
    img = img / 255.
    img = img.astype(np.float32)
    return img

# Wrap raw values into the corresponding Feature types
def _int64_feature(value):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

def _bytes_feature(value):
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

def _float_feature(value):
    return tf.train.Feature(float_list=tf.train.FloatList(value=[value]))

# Write the data into a TFRecords file
train_filename = '/path/to/train.tfrecords'  # output file path
# Create a writer for the TFRecords file
writer = tf.python_io.TFRecordWriter(train_filename)
for i in range(len(train_addrs)):
    # Show progress while writing
    if not i % 1000:
        print('Train data: {}/{}'.format(i, len(train_addrs)))
        sys.stdout.flush()
    # Load the image
    img = load_image(train_addrs[i])
    label = train_labels[i]
    # Create the features
    feature = {'train/label': _int64_feature(label),
               'train/image': _bytes_feature(tf.compat.as_bytes(img.tostring()))}
    # Create an Example protocol buffer
    example = tf.train.Example(features=tf.train.Features(feature=feature))
    # Write the Example protocol buffer to the file
    writer.write(example.SerializeToString())  # serialize the Example to a string
writer.close()
sys.stdout.flush()
Example 3: converting the MNIST input data into TFRecord format, and how to read the data back from the TFRecord file
Converting the MNIST input data into TFRecord format:
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data
import numpy as np

def _int64_feature(value):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

def _bytes_feature(value):
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

mnist = input_data.read_data_sets('C:/Users/1/Desktop/data', dtype=tf.uint8, one_hot=True)
images = mnist.train.images
labels = mnist.train.labels
pixels = images.shape[1]
num_examples = mnist.train.num_examples

# Output TFRecord file path
filename = 'C:/Users/1/Desktop/data/output.tfrecords'
writer = tf.python_io.TFRecordWriter(filename)
for index in range(num_examples):
    image_raw = images[index].tostring()
    example = tf.train.Example(features=tf.train.Features(feature={
        'pixels': _int64_feature(pixels),
        'label': _int64_feature(np.argmax(labels[index])),
        'image_raw': _bytes_feature(image_raw)}))
    writer.write(example.SerializeToString())  # serialize the Example to a string
writer.close()

The program above stores all of the MNIST training data in a single TFRecord file. When the dataset is large, the data can also be split across multiple TFRecord files, as sketched below.
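A sketch of such a split, continuing from the variables defined in Example 3 above; the shard count is arbitrary and the filename pattern simply mirrors the one used in Example 5 below:

num_shards = 4   # arbitrary shard count
writers = [tf.python_io.TFRecordWriter(
               'C:/Users/1/Desktop/data/output.tfrecords-%.5d-of-%.5d' % (s, num_shards))
           for s in range(num_shards)]
for index in range(num_examples):
    image_raw = images[index].tostring()
    example = tf.train.Example(features=tf.train.Features(feature={
        'pixels': _int64_feature(pixels),
        'label': _int64_feature(np.argmax(labels[index])),
        'image_raw': _bytes_feature(image_raw)}))
    writers[index % num_shards].write(example.SerializeToString())  # round-robin over the shards
for w in writers:
    w.close()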
Example 4: reading data from the TFRecord file:

import tensorflow as tf

reader = tf.TFRecordReader()
filename_queue = tf.train.string_input_producer(['C:/Users/1/Desktop/data/output.tfrecords'])
# tf.train.string_input_producer() pairs with tf.train.start_queue_runners() below:
# the former builds the input queue, the latter starts the queue runners
_, serialized_example = reader.read(filename_queue)  # read one example from the file
features = tf.parse_single_example(serialized_example, features={
    'image_raw': tf.FixedLenFeature([], tf.string),
    'pixels': tf.FixedLenFeature([], tf.int64),
    'label': tf.FixedLenFeature([], tf.int64)})
# tf.FixedLenFeature() parses each entry into a Tensor
images = tf.decode_raw(features['image_raw'], tf.uint8)
# tf.decode_raw turns the string into a uint8 tensor, i.e. the pixel array of the image
labels = tf.cast(features['label'], tf.int32)  # cast the label to tf.int32
pixels = tf.cast(features['pixels'], tf.int32)

sess = tf.Session()
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(sess=sess, coord=coord)
for i in range(10):
    image, label, pixel = sess.run([images, labels, pixels])
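Each decoded image here is still a flat uint8 vector of 784 values (28 x 28); if the two-dimensional layout is needed, a reshape can be added. A sketch continuing from the tensors and session above:

# Restore the flat 784-element vector to the original 28x28 MNIST layout
image_2d = tf.reshape(images, [28, 28])
image_arr = sess.run(image_2d)   # reuses the session and queue runners started above
print(image_arr.shape)           # (28, 28)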
Example 5: another example of writing and reading TFRecord files

import tensorflow as tf

def _int64_feature(value):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

# Write the TFRecord files
num_shards = 2
instances_per_shard = 2
for i in range(num_shards):
    filename = ('C:/Users/1/Desktop/data/data.tfrecords-%.5d-of-%.5d' % (i, num_shards))
    writer = tf.python_io.TFRecordWriter(filename)
    for j in range(instances_per_shard):
        example = tf.train.Example(features=tf.train.Features(feature={
            'i': _int64_feature(i), 'j': _int64_feature(j)}))
        writer.write(example.SerializeToString())
    writer.close()

# Read the TFRecord files
files = tf.train.match_filenames_once('C:/Users/1/Desktop/data/data.tfrecords-*')
filename_queue = tf.train.string_input_producer(files, shuffle=False)
reader = tf.TFRecordReader()
_, serialized_example = reader.read(filename_queue)
features = tf.parse_single_example(serialized_example, features={
    'i': tf.FixedLenFeature([], tf.int64),
    'j': tf.FixedLenFeature([], tf.int64)})
# tf.parse_single_example parses an Example protocol block into tensors
# tf.FixedLenFeature parses a fixed-length input feature

with tf.Session() as sess:
    tf.global_variables_initializer().run()
    tf.local_variables_initializer().run()
    print(sess.run(files))
    coord = tf.train.Coordinator()  # create a Coordinator to coordinate the threads
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)  # start all the threads
    for i in range(6):
        print(sess.run([features['i'], features['j']]))
    coord.request_stop()  # ask the threads to stop
    coord.join(threads)   # wait for the threads to terminate
Batching training data:
Reference: http://blog.sina.com.cn/s/blog_6ca0f5eb0102wppn.html
# Continuing from Example 5 above
train, label = features['i'], features['j']
train_batch, label_batch = tf.train.batch([train, label], batch_size=3, capacity=1003)
# batch_size controls how many samples are grouped into one batch
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(tf.local_variables_initializer())
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    for i in range(2):
        cur_train, cur_label = sess.run([train_batch, label_batch])
        print(cur_train, cur_label)
    coord.request_stop()
    coord.join(threads)
tf.train.batch groups individual examples into batches of 3, which are then fed to the input layer of the neural network. The returned tensors have shape [batch_size, tensor.shape]; the example below makes this behaviour clearer.
import tensorflow as tf

tensor_list = [[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12], [13, 14, 15, 16], [17, 18, 19, 20]]
tensor_list2 = [[[1, 2, 3, 4]], [[5, 6, 7, 8]], [[9, 10, 11, 12]], [[13, 14, 15, 16]], [[17, 18, 19, 20]]]
tensor_list3 = [1, 2, 3, 4]

with tf.Session() as sess:
    x1 = tf.train.batch(tensor_list, batch_size=3, enqueue_many=False)
    x2 = tf.train.batch(tensor_list, batch_size=3, enqueue_many=True)
    y1 = tf.train.batch_join(tensor_list, batch_size=3, enqueue_many=False)
    y2 = tf.train.batch_join(tensor_list2, batch_size=3, enqueue_many=True)
    z1 = tf.train.batch(tensor_list3, batch_size=3, enqueue_many=False)
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    print("x1 batch:" + "-" * 10)
    print(sess.run(x1))
    print("x2 batch:" + "-" * 10)
    print(sess.run(x2))
    print("y1 batch:" + "-" * 10)
    print(sess.run(y1))
    print("y2 batch:" + "-" * 10)
    print(sess.run(y2))
    print("-" * 10)
    print(sess.run(z1))
    print("-" * 10)
    coord.request_stop()
    coord.join(threads)
The output is:

x1 batch:----------
[array([[1, 2, 3, 4],       # the returned shape is [batch_size, tensor.shape]; here batch_size = 3
       [1, 2, 3, 4],
       [1, 2, 3, 4]]), array([[5, 6, 7, 8],
       [5, 6, 7, 8],
       [5, 6, 7, 8]]), array([[ 9, 10, 11, 12],
       [ 9, 10, 11, 12],
       [ 9, 10, 11, 12]]), array([[13, 14, 15, 16],
       [13, 14, 15, 16],
       [13, 14, 15, 16]]), array([[17, 18, 19, 20],
       [17, 18, 19, 20],
       [17, 18, 19, 20]])]
x2 batch:----------
[array([1, 2, 3]), array([5, 6, 7]), array([ 9, 10, 11]), array([13, 14, 15]), array([17, 18, 19])]
y1 batch:----------
[array([1, 9, 5]), array([ 2, 10, 6]), array([ 3, 11, 7]), array([ 4, 12, 8])]
y2 batch:----------
[1 2 3]
----------
[array([1, 1, 1]), array([2, 2, 2]), array([3, 3, 3]), array([4, 4, 4])]
# The shape is again [batch_size, tensor.shape], but since the input tensor_list3 is [1, 2, 3, 4],
# each scalar element is repeated batch_size times, e.g. 1 becomes [1, 1, 1].
----------
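tf.train.batch pulls examples from the queue in order; if the batches should also be shuffled, tf.train.shuffle_batch can be substituted. A sketch continuing from the train, label tensors in the batching snippet above, with illustrative capacity values:

# Shuffled batches instead of in-order batches
train_batch, label_batch = tf.train.shuffle_batch(
    [train, label], batch_size=3,
    capacity=1000 + 3 * 3,      # illustrative queue capacity
    min_after_dequeue=30)       # illustrative minimum kept in the queue so there is something to mix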