Reference: https://blog.csdn.net/u012222949/article/details/72875281
Reference: https://blog.csdn.net/chengshuhao1991/article/details/78656724
Reference: https://zhuanlan.zhihu.com/p/27238630
Storing data as TFRecord files:
Converting other data into a TFRecord file takes two steps:
Create a TFRecord writer
Build an Example protocol block for each sample
1. Building the TFRecord writer
The function that creates the writer is:
tf.python_io.TFRecordWriter(path)  # writes a TFRecord file; path is where the TFRecord file will be stored
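A minimal, self-contained sketch of the writer's lifecycle, assuming TensorFlow 1.x; the path is a placeholder and the Example is a trivial one (step 2 below covers how Examples are actually built):

import tensorflow as tf

writer = tf.python_io.TFRecordWriter('/tmp/demo.tfrecords')   # placeholder output path
# A trivial Example with a single int64 feature, just to exercise the writer
example = tf.train.Example(features=tf.train.Features(feature={
    'label': tf.train.Feature(int64_list=tf.train.Int64List(value=[1]))}))
writer.write(example.SerializeToString())
writer.close()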
2. Building the Example protocol block for each sample
The Example protocol buffer is defined as follows:
message Example {
    Features features = 1;
};
message Features {
    map<string, Feature> feature = 1;
};
message Feature {
    oneof kind {
        BytesList bytes_list = 1;
        FloatList float_list = 2;
        Int64List int64_list = 3;
    }
};
Some helper code that builds such a block is shown below:
def _int64_feature(value):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

tf.train.Example(features=tf.train.Features(feature={'i': _int64_feature(1), 'j': _int64_feature(2)}))
# Or, written out directly:
tf.train.Example(features=tf.train.Features(feature={
    'i': tf.train.Feature(int64_list=tf.train.Int64List(value=[1])),
    'j': tf.train.Feature(int64_list=tf.train.Int64List(value=[2]))}))
# The result is:
features {
  feature {
    key: "i"
    value { int64_list { value: 1 } }
  }
  feature {
    key: "j"
    value { int64_list { value: 2 } }
  }
}
tf.train.Example(features=None)
# Used when writing TFRecord files
# features: a tf.train.Features instance
# Returns an Example protocol block

tf.train.Features(feature=None)
# Builds the key-value pairs describing one sample
# feature: a dict whose keys are the names to store and whose values are tf.train.Feature instances
# Returns a Features instance

tf.train.Feature(**options)
# options accepts one of three data formats:
#   bytes_list = tf.train.BytesList(value=[Bytes])
#   int64_list = tf.train.Int64List(value=[Value])
#   float_list = tf.train.FloatList(value=[Value])
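A small sketch that wraps one value of each of the three formats, using the same helper-function style as the examples below; the feature names and values here are arbitrary:

import tensorflow as tf

def _bytes_feature(value):
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

def _int64_feature(value):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

def _float_feature(value):
    return tf.train.Feature(float_list=tf.train.FloatList(value=[value]))

example = tf.train.Example(features=tf.train.Features(feature={
    'name':  _bytes_feature(b'cat.jpg'),   # arbitrary bytes value
    'label': _int64_feature(1),            # arbitrary int value
    'score': _float_feature(0.5)}))        # arbitrary float value
print(example)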
writer = tf.python_io.TFRecordWriter(filename)
example = tf.train.Example(features=tf.train.Features(feature={'i': _int64_feature(i), 'j': _int64_feature(j)}))
writer.write(example.SerializeToString())  # serialize the Example to a string

# The reading code below is the counterpart of the writing code above
filename_queue = tf.train.string_input_producer(files, shuffle=False)  # pass in a list of filenames; TensorFlow turns it into a filename queue
reader = tf.TFRecordReader()
_, serialized = reader.read(filename_queue)
features = tf.parse_single_example(serialized, features={
    'i': tf.FixedLenFeature([], tf.int64),
    'j': tf.FixedLenFeature([], tf.int64)})
# tf.parse_single_example parses an Example protocol block into tensors
i, j = features['i'], features['j']
Finally, here is how image data is converted into a TFRecord; each sample is handled as follows:

example = tf.train.Example(features=tf.train.Features(feature={
    "image": tf.train.Feature(bytes_list=tf.train.BytesList(value=[image])),   # image: the raw image data as bytes
    "label": tf.train.Feature(int64_list=tf.train.Int64List(value=[label]))})) # label: the label as an int
Example 1: converting image files into a TFRecord file (full code):
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import numpy as np
import tensorflow as tf
import pandas as pd

def get_label_from_filename(filename):
    return 1

filenames = tf.train.match_filenames_once('C:/Users/1/Desktop/3/*.jpg')
writer = tf.python_io.TFRecordWriter('C:/Users/1/Desktop/png_train.tfrecords')

with tf.Session() as sess:
    # match_filenames_once creates a local variable, so tf.local_variables_initializer() is needed to initialize it
    sess.run([tf.global_variables_initializer(), tf.local_variables_initializer()])
    filenames = sess.run(filenames)
    print(filenames)
    # The strings obtained here are bytes strings (prefixed with b), analogous to unicode strings prefixed with u.
    # bytes -> unicode is str.decode('utf-8') and unicode -> bytes is str.encode('utf-8'), hence the decode below.
    for filename in filenames:
        img = mpimg.imread(filename.decode('utf-8'))
        print("{} shape is {}".format(filename, img.shape))
        image_raw = img.tostring()
        label = get_label_from_filename(filename)
        example = tf.train.Example(
            features=tf.train.Features(
                feature={
                    "image_raw": tf.train.Feature(bytes_list=tf.train.BytesList(value=[image_raw])),
                    "label": tf.train.Feature(int64_list=tf.train.Int64List(value=[label]))
                }
            )
        )
        writer.write(record=example.SerializeToString())  # serialize the Example to a string
    writer.close()
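A sketch of reading these records back, along the lines of Example 4 below; the decode dtype is an assumption (mpimg.imread typically yields uint8 arrays for JPG files), and the original height/width/channels would be needed to reshape the flat buffer:

import tensorflow as tf

filename_queue = tf.train.string_input_producer(['C:/Users/1/Desktop/png_train.tfrecords'])
reader = tf.TFRecordReader()
_, serialized = reader.read(filename_queue)
parsed = tf.parse_single_example(serialized, features={
    'image_raw': tf.FixedLenFeature([], tf.string),
    'label': tf.FixedLenFeature([], tf.int64)})
image = tf.decode_raw(parsed['image_raw'], tf.uint8)   # assumed dtype: uint8, as written by Example 1
label = tf.cast(parsed['label'], tf.int32)

with tf.Session() as sess:
    sess.run([tf.global_variables_initializer(), tf.local_variables_initializer()])
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    img_val, label_val = sess.run([image, label])
    print(img_val.shape, label_val)   # img_val is still flat; reshaping needs the original dimensions
    coord.request_stop()
    coord.join(threads)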
The glob module:
It returns a list of all file paths that match a pattern.

import glob
glob.glob("/home/zikong/doc/*.doc")
# Returns something like:
# /home/zikong/doc/file1.doc  /home/zikong/doc/file2.doc
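Labels can often be derived from the matched paths themselves; a sketch assuming a hypothetical directory-per-class layout (the paths and class names here are made up):

import glob
import os

# Hypothetical layout: images live under /data/cats/*.jpg and /data/dogs/*.jpg
addrs = glob.glob('/data/*/*.jpg')
# Map each parent directory name to an integer label
label_map = {'cats': 0, 'dogs': 1}
labels = [label_map[os.path.basename(os.path.dirname(a))] for a in addrs]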
Example 2: generating a TFRecord file:
from random import shuffle
import numpy as np
import glob
import tensorflow as tf
import cv2
import sys
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
shuffle_data = True
image_path = '/path/to/image/*.jpg'
# Get the paths of all images under this directory; type(addrs) = list
addrs = glob.glob(image_path)
# How the labels are obtained depends on the dataset; type(labels) = list
labels = ...
# Shuffle the data
if shuffle_data:
    c = list(zip(addrs, labels))  # pair up the two lists
    shuffle(c)                    # shuffle with random.shuffle
    addrs, labels = zip(*c)       # unzip the pairs again
# Split the dataset as needed
train_addrs = addrs[0:int(0.7 * len(addrs))]
train_labels = labels[0:int(0.7 * len(labels))]
val_addrs = addrs[int(0.7 * len(addrs)):int(0.9 * len(addrs))]
val_labels = labels[int(0.7 * len(labels)):int(0.9 * len(labels))]
test_addrs = addrs[int(0.9 * len(addrs)):]
test_labels = labels[int(0.9 * len(labels)):]

# Load an image from one of the addresses obtained above
def load_image(addr):
    img = cv2.imread(addr)
    img = cv2.resize(img, (224, 224), interpolation=cv2.INTER_CUBIC)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    # Divide by 255 to normalize the pixel values into [0, 1]
    img = img / 255.
    img = img.astype(np.float32)
    return img

# Wrap raw values into the corresponding Feature types
def _int64_feature(value):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

def _bytes_feature(value):
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

def _float_feature(value):
    return tf.train.Feature(float_list=tf.train.FloatList(value=[value]))

# Write the data into a TFRecords file
train_filename = '/path/to/train.tfrecords'  # output file path
# Create a writer for the TFRecords file
writer = tf.python_io.TFRecordWriter(train_filename)
for i in range(len(train_addrs)):
    # Show progress while writing
    if not i % 1000:
        print('Train data: {}/{}'.format(i, len(train_addrs)))
        sys.stdout.flush()
    # Load the image
    img = load_image(train_addrs[i])
    label = train_labels[i]
    # Create the features
    feature = {'train/label': _int64_feature(label),
               'train/image': _bytes_feature(tf.compat.as_bytes(img.tostring()))}
    # Create an Example protocol buffer
    example = tf.train.Example(features=tf.train.Features(feature=feature))
    # Write the Example protocol buffer to the file
    writer.write(example.SerializeToString())  # serialize the Example to a string
writer.close()
sys.stdout.flush()
Example 3: converting the MNIST input data into TFRecord format, and how to read the data back from the TFRecord file
Converting the MNIST input data into TFRecord format:
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data
import numpy as np

def _int64_feature(value):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

def _bytes_feature(value):
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

mnist = input_data.read_data_sets('C:/Users/1/Desktop/data', dtype=tf.uint8, one_hot=True)
images = mnist.train.images
labels = mnist.train.labels
pixels = images.shape[1]
num_examples = mnist.train.num_examples

# Output TFRecord file path
filename = 'C:/Users/1/Desktop/data/output.tfrecords'
writer = tf.python_io.TFRecordWriter(filename)
for index in range(num_examples):
    image_raw = images[index].tostring()
    example = tf.train.Example(features=tf.train.Features(feature={
        'pixels': _int64_feature(pixels),
        'label': _int64_feature(np.argmax(labels[index])),
        'image_raw': _bytes_feature(image_raw)}))
    writer.write(example.SerializeToString())  # serialize the Example to a string
writer.close()

The program above stores all of the MNIST training data in a single TFRecord file. When the dataset is large, the data can also be split across multiple TFRecord files, as sketched below.
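A sketch of such a split, continuing from the variables defined in Example 3 above; the shard count is arbitrary and the filename pattern simply mirrors the one used in Example 5 below:

num_shards = 4   # arbitrary shard count
writers = [tf.python_io.TFRecordWriter(
               'C:/Users/1/Desktop/data/output.tfrecords-%.5d-of-%.5d' % (s, num_shards))
           for s in range(num_shards)]
for index in range(num_examples):
    image_raw = images[index].tostring()
    example = tf.train.Example(features=tf.train.Features(feature={
        'pixels': _int64_feature(pixels),
        'label': _int64_feature(np.argmax(labels[index])),
        'image_raw': _bytes_feature(image_raw)}))
    writers[index % num_shards].write(example.SerializeToString())  # round-robin over the shards
for w in writers:
    w.close()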
Example 4: reading data from the TFRecord file:

import tensorflow as tf

reader = tf.TFRecordReader()
filename_queue = tf.train.string_input_producer(['C:/Users/1/Desktop/data/output.tfrecords'])
# tf.train.string_input_producer() pairs with tf.train.start_queue_runners() below:
# the former builds the input queue, the latter starts the queue runners
_, serialized_example = reader.read(filename_queue)  # read one example from the file
features = tf.parse_single_example(serialized_example, features={
    'image_raw': tf.FixedLenFeature([], tf.string),
    'pixels': tf.FixedLenFeature([], tf.int64),
    'label': tf.FixedLenFeature([], tf.int64)})
# tf.FixedLenFeature() parses each entry into a Tensor
images = tf.decode_raw(features['image_raw'], tf.uint8)
# tf.decode_raw turns the string into a uint8 tensor, i.e. the pixel array of the image
labels = tf.cast(features['label'], tf.int32)  # cast the label to tf.int32
pixels = tf.cast(features['pixels'], tf.int32)

sess = tf.Session()
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(sess=sess, coord=coord)
for i in range(10):
    image, label, pixel = sess.run([images, labels, pixels])
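Each decoded image here is still a flat uint8 vector of 784 values (28 x 28); if the two-dimensional layout is needed, a reshape can be added. A sketch continuing from the tensors and session above:

# Restore the flat 784-element vector to the original 28x28 MNIST layout
image_2d = tf.reshape(images, [28, 28])
image_arr = sess.run(image_2d)   # reuses the session and queue runners started above
print(image_arr.shape)           # (28, 28)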
Example 5: another example of writing and reading TFRecord files

import tensorflow as tf

def _int64_feature(value):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

# Write the TFRecord files
num_shards = 2
instances_per_shard = 2
for i in range(num_shards):
    filename = ('C:/Users/1/Desktop/data/data.tfrecords-%.5d-of-%.5d' % (i, num_shards))
    writer = tf.python_io.TFRecordWriter(filename)
    for j in range(instances_per_shard):
        example = tf.train.Example(features=tf.train.Features(feature={
            'i': _int64_feature(i), 'j': _int64_feature(j)}))
        writer.write(example.SerializeToString())
    writer.close()

# Read the TFRecord files
files = tf.train.match_filenames_once('C:/Users/1/Desktop/data/data.tfrecords-*')
filename_queue = tf.train.string_input_producer(files, shuffle=False)
reader = tf.TFRecordReader()
_, serialized_example = reader.read(filename_queue)
features = tf.parse_single_example(serialized_example, features={
    'i': tf.FixedLenFeature([], tf.int64),
    'j': tf.FixedLenFeature([], tf.int64)})
# tf.parse_single_example parses an Example protocol block into tensors
# tf.FixedLenFeature parses a fixed-length input feature

with tf.Session() as sess:
    tf.global_variables_initializer().run()
    tf.local_variables_initializer().run()
    print(sess.run(files))
    coord = tf.train.Coordinator()  # create a Coordinator to coordinate the threads
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)  # start all the threads
    for i in range(6):
        print(sess.run([features['i'], features['j']]))
    coord.request_stop()  # ask the threads to stop
    coord.join(threads)   # wait for the threads to terminate
Batching training data:
Reference: http://blog.sina.com.cn/s/blog_6ca0f5eb0102wppn.html
# Continuing from Example 5 above
train, label = features['i'], features['j']
train_batch, label_batch = tf.train.batch([train, label], batch_size=3, capacity=1003)
# batch_size controls how many samples are grouped into one batch
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(tf.local_variables_initializer())
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    for i in range(2):
        cur_train, cur_label = sess.run([train_batch, label_batch])
        print(cur_train, cur_label)
    coord.request_stop()
    coord.join(threads)
tf.train.batch groups individual examples into batches of 3, which are then fed to the input layer of the neural network. The returned tensors have shape [batch_size, tensor.shape]; the example below makes this behaviour clearer.
import tensorflow as tf

tensor_list = [[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12], [13, 14, 15, 16], [17, 18, 19, 20]]
tensor_list2 = [[[1, 2, 3, 4]], [[5, 6, 7, 8]], [[9, 10, 11, 12]], [[13, 14, 15, 16]], [[17, 18, 19, 20]]]
tensor_list3 = [1, 2, 3, 4]

with tf.Session() as sess:
    x1 = tf.train.batch(tensor_list, batch_size=3, enqueue_many=False)
    x2 = tf.train.batch(tensor_list, batch_size=3, enqueue_many=True)
    y1 = tf.train.batch_join(tensor_list, batch_size=3, enqueue_many=False)
    y2 = tf.train.batch_join(tensor_list2, batch_size=3, enqueue_many=True)
    z1 = tf.train.batch(tensor_list3, batch_size=3, enqueue_many=False)
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    print("x1 batch:" + "-" * 10)
    print(sess.run(x1))
    print("x2 batch:" + "-" * 10)
    print(sess.run(x2))
    print("y1 batch:" + "-" * 10)
    print(sess.run(y1))
    print("y2 batch:" + "-" * 10)
    print(sess.run(y2))
    print("-" * 10)
    print(sess.run(z1))
    print("-" * 10)
    coord.request_stop()
    coord.join(threads)
The output is:

x1 batch:----------
[array([[1, 2, 3, 4],       # the returned shape is [batch_size, tensor.shape]; here batch_size = 3
       [1, 2, 3, 4],
       [1, 2, 3, 4]]), array([[5, 6, 7, 8],
       [5, 6, 7, 8],
       [5, 6, 7, 8]]), array([[ 9, 10, 11, 12],
       [ 9, 10, 11, 12],
       [ 9, 10, 11, 12]]), array([[13, 14, 15, 16],
       [13, 14, 15, 16],
       [13, 14, 15, 16]]), array([[17, 18, 19, 20],
       [17, 18, 19, 20],
       [17, 18, 19, 20]])]
x2 batch:----------
[array([1, 2, 3]), array([5, 6, 7]), array([ 9, 10, 11]), array([13, 14, 15]), array([17, 18, 19])]
y1 batch:----------
[array([1, 9, 5]), array([ 2, 10, 6]), array([ 3, 11, 7]), array([ 4, 12, 8])]
y2 batch:----------
[1 2 3]
----------
[array([1, 1, 1]), array([2, 2, 2]), array([3, 3, 3]), array([4, 4, 4])]
# The shape is again [batch_size, tensor.shape], but since the input tensor_list3 is [1, 2, 3, 4],
# each scalar element is repeated batch_size times, e.g. 1 becomes [1, 1, 1].
----------
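tf.train.batch pulls examples from the queue in order; if the batches should also be shuffled, tf.train.shuffle_batch can be substituted. A sketch continuing from the train, label tensors in the batching snippet above, with illustrative capacity values:

# Shuffled batches instead of in-order batches
train_batch, label_batch = tf.train.shuffle_batch(
    [train, label], batch_size=3,
    capacity=1000 + 3 * 3,      # illustrative queue capacity
    min_after_dequeue=30)       # illustrative minimum kept in the queue so there is something to mix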