Requirements

I recently received a requirement to recognize specific characters at fixed positions on a screen, covering the digits 0-9 and the unit symbols nm and μm (nanometer and micrometer). That makes 14 classes in total: 0-9, n, m, μ, and the decimal point.

As shown inside the red boxes in the figure below, the task is to recognize the characters within the red markers.

Since the amount of real-scene data is small, a dataset in the matching font has to be generated for training.
Image Classification Approach

Because there are only a few character classes, image classification avoids the tedious work of annotating data, so I tried classification first; the test results, however, were far from satisfactory.
Character images can be generated in Python with:

- cv2.putText() in OpenCV

```python
img = cv2.putText(pixel, str(_label[0]), (col_position, row_position), cv2.FONT_HERSHEY_PLAIN, 2, (255, 255, 255), 3)
```

- the text() method of PIL's ImageDraw.Draw class

```python
draw = ImageDraw.Draw(img_PIL)
draw.text(position, character, font=font, fill=fillcolor)
```
cv2.putText() selects its font through the cv2.FONT_xxx constants; nine font options are available in total, as shown in the figure below:
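For reference, a minimal sketch that previews the built-in fonts side by side (the eight Hershey families are listed below; cv2.FONT_ITALIC can additionally be OR-ed in as a flag, and the output file name is arbitrary):

```python
import cv2
import numpy as np

# the Hershey font constants provided by OpenCV
fonts = [cv2.FONT_HERSHEY_SIMPLEX, cv2.FONT_HERSHEY_PLAIN, cv2.FONT_HERSHEY_DUPLEX,
         cv2.FONT_HERSHEY_COMPLEX, cv2.FONT_HERSHEY_TRIPLEX, cv2.FONT_HERSHEY_COMPLEX_SMALL,
         cv2.FONT_HERSHEY_SCRIPT_SIMPLEX, cv2.FONT_HERSHEY_SCRIPT_COMPLEX]

canvas = np.full((64, 64 * len(fonts), 3), 128, np.uint8)
for k, f in enumerate(fonts):
    # draw the same digit with every font so the styles can be compared
    cv2.putText(canvas, "5", (64 * k + 16, 48), f, 2, (255, 255, 255), 3)
cv2.imwrite("font_preview.jpg", canvas)
```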
By choosing a suitable font and randomizing the background color and position, a digit dataset like the one shown below can be generated; the image data and the label information are saved in .mat format (since this is image classification, only the images and their labels are needed, and .mat is easy to store).

The generation code is as follows:
```python
import cv2
import numpy as np
import os
from PIL import Image
from scipy import io

# paths
img_path = './data/img'
txt_path = './data/txt'
if not os.path.exists(img_path):
    os.makedirs(img_path)
if not os.path.exists(txt_path):
    os.makedirs(txt_path)

point_img_path = "C:\\Users\\vtstar\\PROJECT_OCR\\data\\model"
point_list = os.listdir(point_img_path)
point_num = len(point_list)
# initialization
num = 10000
mat_path = './data'
x = np.zeros((32, 32, 3, num + point_num), np.uint8)
y = np.zeros((num + point_num, 1), np.int)
# random generation
for i in range(0, num):
    # digits 0-9
    _name = str(i).rjust(6, "0")                        # file name
    _label = np.random.randint(0, 10, 1).astype(int)
    _gray = np.random.randint(50, 170, 1)               # background color
    col_position = np.random.randint(0, 17)             # start along the width
    row_position = np.random.randint(20, 32)            # start along the height (bottom-left corner)
    pixel = np.ones((32, 32, 3), np.uint8) * _gray[0]
    img = cv2.putText(pixel, str(_label[0]), (col_position, row_position),
                      cv2.FONT_HERSHEY_PLAIN, 2, (255, 255, 255), 3)
    img_dir = os.path.join(img_path, _name + '.jpg')
    txt_dir = os.path.join(txt_path, _name + '.txt')
    # save
    cv2.imwrite(img_dir, img)
    np.savetxt(txt_dir, _label, fmt="%d")
    x[:, :, :, i] = img
    y[i, 0] = _label
    # print a progress bar
    if (i + 1) % 100 == 0:
        m = (i + 1) // 100
        print('\rProgress: {0}{1}%'.format('▉' * m, m), end='')
# save as .mat
dic = {'x': x, 'y': y}
io.savemat('new_digit_point.mat', dic)
```
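To check the saved file, a minimal sketch for reading it back with scipy (the file name and the keys 'x' and 'y' match the dictionary saved above):

```python
from scipy import io
import numpy as np

# load the dataset written by io.savemat above
data = io.loadmat('new_digit_point.mat')
x, y = data['x'], data['y']   # x: 32x32x3xN images, y: Nx1 labels
print(x.shape, y.shape, np.unique(y))
```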
In the same way, character images for m and n can be generated.

However, when the 'μ' character is drawn with the fonts built into OpenCV, the output is always "??".

PIL Image to the rescue!!!

Font setup:
```python
font = ImageFont.truetype('Arvo-Regular.ttf', 30, encoding="gbk")
```
Here the font can be changed by replacing 'Arvo-Regular.ttf'; the fonts come from the system font library, located at:
C:\Windows\Fonts
As shown below, the name a font goes by in code can be found in its file properties:

Drawing the character:
```python
draw = ImageDraw.Draw(img_PIL)
character = 'μ'
draw.text(position, character, font=font, fill=fillcolor)
```
This produces an image of the character 'μ', as shown below:

The full code for generating one 'μ' image:
```python
from PIL import Image, ImageFont, ImageDraw
import numpy as np
import cv2


def paint_u_opencv():
    _gray = int(np.random.randint(50, 170, 1)[0])
    img_PIL = Image.new("RGB", (32, 32), (_gray, _gray, _gray))
    font = ImageFont.truetype('Arvo-Regular.ttf', 30, encoding="gbk")
    # fonts that can render 'μ':
    # AdobeHeitiStd-Regular.otf
    # AdobeFangsongStd-Regular.otf
    # AdobeGothicStd-Bold.otf *
    fillcolor = (255, 255, 255)
    w_position = int(np.random.randint(0, 16, 1)[0])
    h_position = int(np.random.randint(-14, 0, 1)[0])
    position = (w_position, h_position)
    draw = ImageDraw.Draw(img_PIL)
    character = 'μ'
    draw.text(position, character, font=font, fill=fillcolor)
    img_PIL = img_PIL.convert('L')
    img = np.asarray(img_PIL)
    return img
```
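The usable fonts noted in the comments above were found by trial and error; a minimal sketch for screening the system fonts automatically, assuming the fontTools package is installed (a font whose character map contains U+03BC should be able to draw 'μ'):

```python
import os
from fontTools.ttLib import TTFont

font_dir = r"C:\Windows\Fonts"
for name in os.listdir(font_dir):
    if not name.lower().endswith((".ttf", ".otf")):
        continue
    try:
        cmap = TTFont(os.path.join(font_dir, name)).getBestCmap()
    except Exception:
        continue  # skip fonts fontTools cannot parse
    if ord('μ') in cmap:
        print(name)  # this font has a glyph for U+03BC
```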
After adapting the LeNet architecture and training it, the model achieves good results on the simulated data, but in the real scene the results are very poor.

Building the LeNet model
```python
import os
import numpy as np
import tensorflow as tf
import tensorflow.contrib.slim as slim
from tensorflow.contrib.layers import xavier_initializer
import CONFIG as cfg


class Model(object):
    def __init__(self):
        self.raw_input_image = tf.placeholder(tf.float32, shape=[None, 32, 32, 1])
        self.input_image = tf.reshape(self.raw_input_image, shape=[-1, 32, 32, 1])
        self.raw_input_label = tf.placeholder(tf.float32, shape=[None, 14])  # 14 classes: 10 digits, decimal point, n, m, μ
        self.input_label = tf.cast(self.raw_input_label, dtype=tf.float32)
        self.current_epoch = tf.placeholder(tf.int32)
        self.keep_prob = cfg.KEEP_PROB
        self.lr = cfg.LEARNING_RATE

        with tf.variable_scope("Model") as scope:
            self.train_digits = self.model(is_trained=True)
            scope.reuse_variables()
            self.test_digits = self.model(is_trained=False)

        # training
        self.loss = tf.losses.softmax_cross_entropy(logits=self.train_digits, onehot_labels=self.input_label, scope="loss")
        self.learning_rate = tf.train.exponential_decay(self.lr, global_step=self.current_epoch, decay_steps=1000, decay_rate=0.985)
        self.optimizer = tf.train.AdamOptimizer(learning_rate=self.lr).minimize(self.loss)

        # training metrics
        self.predictions = tf.math.argmax(self.train_digits, 1)
        self.correlation = tf.equal(self.predictions, tf.math.argmax(self.input_label, 1))
        self.accuracy = tf.reduce_mean(tf.cast(self.correlation, dtype="float"))

        # test metrics
        self.predictions_test = tf.math.argmax(self.test_digits, 1)
        self.correlation_test = tf.equal(self.predictions_test, tf.math.argmax(self.input_label, 1))
        self.accuracy_test = tf.reduce_mean(tf.cast(self.correlation_test, "float"))

    def model(self, is_trained=True):
        with tf.device(":/gpu:0"):
            with slim.arg_scope([slim.conv2d, slim.fully_connected],
                                weights_initializer=tf.truncated_normal_initializer(stddev=0.1),  # too small a stddev makes validation accuracy fluctuate and stay low
                                weights_regularizer=slim.l2_regularizer(0.05),
                                biases_initializer=tf.constant_initializer(0)):
                with slim.arg_scope([slim.conv2d], padding="valid"):
                    net = slim.conv2d(self.input_image, 6, kernel_size=[5, 5], stride=1, scope="conv_1")  # 28*28*6
                    net = slim.max_pool2d(net, [2, 2], scope="pool_2")                                    # 14*14*6
                    net = slim.conv2d(net, 16, kernel_size=[5, 5], stride=1, scope="conv_3")              # 10*10*16
                    net = slim.max_pool2d(net, [2, 2], scope="pool_4")                                    # 5*5*16
                    net = slim.conv2d(net, 120, kernel_size=[1, 1], stride=1, scope="conv_5")
                    net = slim.flatten(net, scope='flatten')
                    net = slim.fully_connected(net, 120, scope="fc_6")
                    net = slim.dropout(net, self.keep_prob, is_training=is_trained, scope="dropout_7")
                    net = slim.fully_connected(net, 84, activation_fn=tf.nn.leaky_relu, scope="fc_8")
                    net = slim.dropout(net, self.keep_prob, is_training=is_trained, scope='dropout_9')
                    net = slim.fully_connected(net, 14, activation_fn=None, scope="fc_10")
                    digits = slim.nn.softmax(net)
                    return digits
```
Training the LeNet model on the GPU:
```python
import os
import tensorflow as tf
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
from load_img_info import load_img_info
from model import Model
import datetime
import cv2
import numpy as np
import CONFIG as cfg
import time

digit_num = 14000  # controls the size of the dataset


def main():
    # prepare
    print("prepare")
    input_img, input_label = load_img_info()

    idx = np.arange(0, input_img.shape[0], 1)
    np.random.shuffle(idx)
    idx_1 = idx[0: int(np.floor(idx.shape[0] * cfg.VALIDATION_RATIO))]   # validation indices
    idx_2 = idx[int(np.floor(idx.shape[0] * cfg.VALIDATION_RATIO)):]     # training indices

    # validation_datasets = input_img[idx_1, :, :, :]
    # validation_label = input_label[idx_1, 0]
    validation_datasets = input_img
    validation_label = input_label[:, 0]
    validation_datasets_x = validation_datasets

    train_datasets = input_img[idx_2, :, :, :]
    train_label = input_label[idx_2, 0]
    train_datasets_x = train_datasets

    print("finished!")

    iter = cfg.MAX_ITER
    batch = cfg.BATCH_SIZE
    checkpoint_dir = os.path.join(cfg.BACK_UP, str(digit_num))
    TIMESTAMP = '{0:%Y-%m-%d-%H-%M}'.format(datetime.datetime.now())
    checkpoint_dir = os.path.join(checkpoint_dir, TIMESTAMP)

    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)

    tf.reset_default_graph()
    config = tf.ConfigProto(log_device_placement=True)  # , allow_soft_placement=True
    model = Model()
    # checkpoint saver
    saver = tf.train.Saver(max_to_keep=1)
    validation_record = []
    max_val_acc = 0
    with tf.device('/gpu:0'):
        # not passing config to tf.Session() avoids printing a pile of device placement info
        # during training; os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' alone cannot suppress it
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            validation_label_y = sess.run(tf.one_hot(validation_label, depth=14))
            train_label_y = sess.run(tf.one_hot(train_label, depth=14))
            for i in range(iter):
                start_time = time.time()
                total_batch = int(train_datasets.shape[0] / batch)
                avg_loss = 0
                avg_acc = 0
                for j in range(total_batch):
                    if j == total_batch - 1:
                        x = train_datasets_x[j * batch:, :, :, :] / 255.0
                        y = train_label_y[j * batch:, :]
                    else:
                        x = train_datasets_x[j * batch: (j + 1) * batch, :, :, :] / 255.0
                        y = train_label_y[j * batch: (j + 1) * batch, :]

                    _y = y
                    _, loss, lr, acc = sess.run(
                        [model.optimizer, model.loss, model.learning_rate, model.accuracy],
                        feed_dict={model.raw_input_image: x,
                                   model.raw_input_label: _y,
                                   model.current_epoch: i})
                    avg_loss += loss
                    avg_acc += acc
                print("Epoch: %d, avg_cost: %g, avg_acc: %f, lr: %f, time_consume: %f"
                      % (i, avg_loss, avg_acc / total_batch, lr, time.time() - start_time))

                if (i + 1) % 100 == 0:  # validate every 100 epochs
                    valid_start_time = time.time()
                    validation_acc = sess.run([model.accuracy_test],
                                              feed_dict={model.raw_input_image: validation_datasets_x / 255.0,
                                                         model.raw_input_label: validation_label_y})
                    if validation_acc[0] > max_val_acc:
                        max_val_acc = validation_acc
                        saver.save(sess, checkpoint_dir + "\\ocr.ckpt", global_step=i + 1)

                    validation_record.append(validation_acc[0])
                    print("Validation accuracy: %f, Validation time consume: %f"
                          % (validation_acc[0], time.time() - valid_start_time))
            print(validation_record)
            test_acc = sess.run([model.accuracy_test],
                                feed_dict={model.raw_input_image: validation_datasets_x,
                                           model.raw_input_label: validation_label_y})
            test_acc_1 = sess.run([model.accuracy_test],
                                  feed_dict={model.raw_input_image: validation_datasets_x / 255.0,
                                             model.raw_input_label: validation_label_y})
            np.savetxt("validation_record.txt", validation_record)
            print(test_acc, test_acc_1)


if __name__ == '__main__':
    main()
```
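Once training finishes, the saved checkpoint can be restored for single-image prediction. A minimal sketch, assuming a 32×32 grayscale crop and an example checkpoint path (the real path depends on the timestamped directory created above, and "sample_crop.jpg" is a placeholder):

```python
import cv2
import numpy as np
import tensorflow as tf
from model import Model

tf.reset_default_graph()
model = Model()
saver = tf.train.Saver()

with tf.Session() as sess:
    # restore the best checkpoint written by the training script (path is an example)
    saver.restore(sess, tf.train.latest_checkpoint("backup/14000/2020-01-01-00-00"))
    img = cv2.imread("sample_crop.jpg", cv2.IMREAD_GRAYSCALE)
    img = cv2.resize(img, (32, 32)).reshape(1, 32, 32, 1) / 255.0  # same preprocessing as training
    pred = sess.run(model.predictions_test, feed_dict={model.raw_input_image: img})
    print(pred[0])  # index into the 14 classes
```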
******************************************************************************************************
Object Detection Approach

Doing OCR on the generated image dataset with an image-classification model (LeNet, AlexNet, or VGG) requires character segmentation first: the shapes of the segmented characters are hard to control, and the generated data struggles to match the distribution of the real scene. In this scenario that also easily introduces a second source of error, so I decided to use object detection to detect the OCR characters directly in the target region. The one advantage the classification approach keeps is that it needs no manual annotation effort...

Since the characters in this scenario sit at a fixed position on the screen, the CTPN step can be skipped and detection run directly; the price is that manual annotation is required.
Pre-training

Since PIL Image can use any locally installed font, it is easy to generate images in the same font as the real scene to pre-train the model (real-scene data is extremely scarce), and then fine-tune the model with the real-scene data.

Using the background color of the real scene as a template, the character dataset is generated at random on top of that template. The template image and the generated image dataset are shown below:

By adjusting the spacing between characters, the generated images reproduce the spacing and adhesion seen in the real scene; the figure below shows decimal-point adhesion in a real image and in a generated image.

It is also worth noting that the decimal point in the real scene is square; during testing, the 'ARIALN.TTF' font turned out to be the closest match to the real scene, which the figure above also shows.
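The font was chosen by visual comparison; a minimal sketch that renders the same string with a few candidate fonts so the decimal point shape and spacing can be compared against real screenshots (the string and the font list are just examples):

```python
from PIL import Image, ImageFont, ImageDraw

sample = "092.13μm"                                            # example string
candidates = ["ARIALN.TTF", "arial.ttf", "Arvo-Regular.ttf"]   # example fonts to compare
for name in candidates:
    img = Image.new("RGB", (320, 80), (70, 70, 70))
    draw = ImageDraw.Draw(img)
    draw.text((10, 10), sample, font=ImageFont.truetype(name, 60), fill=(181, 185, 176))
    img.save("compare_%s.png" % name.split(".")[0])
```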
The code below generates simulated images of a four- or five-digit number with a decimal point and a random unit. Since the goal is only character detection, no numeric formatting rules are imposed on the generated images, which means values such as 092.13 can occur.
```python
import cv2
import numpy as np
import random
import os
import copy
from PIL import Image, ImageFont, ImageDraw
# generate a 4-5 digit random decimal


def img_name(id):
    c = 0
    while id != 0:
        id = id // 10
        c += 1
    return c


save_path = "E:\\VT_PROJECT\\PROJECT_OCR\\data\\new_template\\img"
if not os.path.exists(save_path):
    os.makedirs(save_path)
pixel_arr = cv2.imread("E:\\VT_PROJECT\\PROJECT_OCR\\data\\new_template\\background.png", 1)

for i in range(0, 3000):  # generate 3000 images
    # build the file name from the id
    _name = str(i).rjust(6, '0')
    # background
    pixel_arr_copy = copy.deepcopy(pixel_arr)
    # random digits
    x = np.arange(10)
    np.random.shuffle(x)
    prob = np.random.rand()
    digital_num = 4 if prob > 0.5 else 5
    candidate = x[0:digital_num]
    candidate_num = list(map(str, candidate))
    # decimal point position
    point_position = int(np.random.randint(1, digital_num - 1, 1))
    if candidate_num[0] == 0:
        candidate_num[1], candidate_num[0] = candidate_num[0], candidate_num[1]
        candidate_num.insert(point_position, '.')
    else:
        candidate_num.insert(point_position, '.')

    # nm or μm
    character = [["n", "m"], ["μ", "m"]]
    prob = np.random.rand()
    uint = character[0] if prob < 0.5 else character[1]
    candidate_num.extend(uint)

    # drawing setup (other usable fonts: simsun.ttc, Arvo-Regular.ttf)
    img = Image.fromarray(pixel_arr_copy)
    font = ImageFont.truetype('ARIALN.TTF', 60, encoding="gbk")
    fillcolor = (181, 185, 176)
    draw = ImageDraw.Draw(img)
    # character spacing depends on the current character
    inter_pixel = 22
    start_w = np.random.randint(5, 30, 1)[0]
    start_h = np.random.randint(10, 30, 1)[0]
    flag = False
    for id, value in enumerate(candidate_num):
        if value == ".":
            flag = True
            inter_pixel = 20
        elif flag:
            flag = False
            inter_pixel = 7
        elif value == "n" or value == "μ":
            inter_pixel = 32
        elif value == "m":
            inter_pixel = 24
        else:
            inter_pixel = 22
        start_w += inter_pixel
        draw.text((start_w, start_h), value, font=font, fill=fillcolor, stroke_width=1)
    img_array = np.array(img)
    img_dir = os.path.join(save_path, _name + ".jpg")
    cv2.imwrite(img_dir, img_array)
```
Generating the corresponding dataset:

The simulated dataset is first annotated with labelImg, producing 900 training samples:
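labelImg saves Pascal VOC XML by default (it can also export YOLO txt directly); if XML was used, a minimal sketch for converting one annotation file into the darknet-style YOLO format, assuming a class list in the same order as the 14 classes used for training:

```python
import xml.etree.ElementTree as ET

# assumed class order; must match the training configuration
classes = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '.', 'n', 'm', 'μ']


def voc_to_yolo(xml_path, txt_path):
    root = ET.parse(xml_path).getroot()
    w = float(root.find('size/width').text)
    h = float(root.find('size/height').text)
    with open(txt_path, 'w', encoding='utf-8') as f:
        for obj in root.iter('object'):
            cls = classes.index(obj.find('name').text)
            box = obj.find('bndbox')
            xmin, ymin = float(box.find('xmin').text), float(box.find('ymin').text)
            xmax, ymax = float(box.find('xmax').text), float(box.find('ymax').text)
            # YOLO format: class x_center y_center width height, normalized to [0, 1]
            f.write("%d %.6f %.6f %.6f %.6f\n" % (cls,
                                                  (xmin + xmax) / 2 / w, (ymin + ymax) / 2 / h,
                                                  (xmax - xmin) / w, (ymax - ymin) / h))


voc_to_yolo('000000.xml', '000000.txt')  # example file names
```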
A model trained on the generated simulated data (YOLOv3-tiny) reaches very good test results on the simulated dataset, but in the real scene there are still accuracy problems such as missed and false detections.

At this point the model is able to recognize the characters, but the distribution it was trained on differs from the real scene.
Model Fine-tuning

Crop the real-scene images (each image can be cropped several times) to build a real-scene dataset (110 samples), and annotate it.
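A minimal sketch of how one screenshot can yield several training crops by jittering the crop window around the character region (the file name and coordinates are placeholders for the actual screen position):

```python
import cv2
import numpy as np

img = cv2.imread("scene_screenshot.png")        # example real-scene screenshot
x0, y0, w, h = 100, 200, 260, 80                # placeholder position of the character region
for k in range(5):                              # several crops from one image
    dx, dy = np.random.randint(-10, 10, 2)      # shift the window so the crops differ
    crop = img[y0 + dy: y0 + dy + h, x0 + dx: x0 + dx + w]
    cv2.imwrite("scene_crop_%02d.jpg" % k, crop)
```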
Retrain (fine-tune) starting from the weights obtained on the simulated dataset. During fine-tuning, if adherent decimal points are still being missed, the proportion of decimal-point-adhesion samples in the training set can be increased.

The test results are as follows: