文件說明:
1. images.py——圖像處理函數,主要是特征提取;
2. model_training.py——訓練CNN單字識別模型(需要較高性能的服務器,最好有GPU加速,否則真是慢得要死);
3. ocr.py——識別函數,包括單字分割、前面訓練好的模型進行單字識別、動態規划提升效果;
4. main.py——主文件,用來調用1、3兩個文件。
5. 模型中包含的字.txt(UTF-8編碼);
文件1:images.py
# -*- coding:utf-8 -*-
# Image-processing / feature-extraction module (Python 2).
# Pipeline: read image -> KDE layer clustering -> erosion test -> pooling
# -> denoising -> candidate-region generation -> region merging/trimming.
# NOTE: several functions drive side-effecting closures via map(); this
# relies on Python 2's eager map() and would silently do nothing on Python 3.

import numpy as np
from scipy import misc,ndimage
from scipy.stats import gaussian_kde as kde
from tqdm import *  # provides trange used below


def myread(filename):
    """Read an image as grayscale, zoom x2, square-transform, rescale to 0..255."""
    print u'讀取圖片中...'
    pic = misc.imread(filename, flatten = True)  # flatten=True -> grayscale float
    pic = ndimage.zoom(pic, 2)                   # upscale by factor 2
    pic = pic**2                                 # square transform stretches contrast
    pic = ((pic-pic.min())/(pic.max()-pic.min())*255).round()  # min-max normalize to [0,255]
    print u'讀取完成.'
    return pic


def decompose(pic):
    """Kernel-density clustering of gray levels.

    Finds minima/maxima of the KDE over [0,256) and maps every pixel to the
    gray-level peak of the density interval it falls in, producing a
    quantized "layer" image.
    """
    print u'圖層聚類分解中...'
    d0 = kde(pic.reshape(-1), bw_method=0.2)(range(256))  # kernel density estimate over gray levels
    d = np.diff(d0)
    d1 = np.where((d[:-1]<0)*(d[1:]>0))[0]  # local minima (sign change - to +)
    d1 = [0]+list(d1)+[256]                 # pad with boundaries
    d2 = np.where((d[:-1]>0)*(d[1:]<0))[0]  # local maxima (sign change + to -)
    # Ensure maxima and minima alternate: pad a peak at each end if needed.
    if d1[1] < d2[0]:
        d2 = [0]+list(d2)
    if d1[len(d1)-2] > d2[len(d2)-1]:
        d2 = list(d2)+[255]
    # Each pixel gets the peak value of the density interval containing it.
    dc = sum(map(lambda i: d2[i]*(pic >= d1[i])*(pic < d1[i+1]), range(len(d2))))
    print u'分解完成. 共%s個圖層'%len(d2)
    return dc


def erosion_test(dc):
    """Erosion-resistance test: drop connected components that are either
    almost unaffected (>0.9 survives) or almost destroyed (<0.1 survives)
    by one binary-erosion step; keeps "text-like" regions per layer."""
    print u'抗腐蝕能力測試中...'
    layers = []
    #bg = np.argmax(np.bincount(dc.reshape(-1)))
    #d = [i for i in np.unique(dc) if i != bg]
    d = np.unique(dc)
    for k in d:
        f = dc==k
        label_im, nb_labels = ndimage.label(f, structure=np.ones((3,3)))  # 8-connected components
        ff = ndimage.binary_erosion(f)  # one erosion pass
        def test_one(i):
            index = label_im==i
            # Survival ratio of the component under erosion; prune extremes.
            if (1.0*ff[index].sum()/f[index].sum() > 0.9) or (1.0*ff[index].sum()/f[index].sum() < 0.1):
                f[index] = False
        # NOTE: eager Python-2 map() runs test_one for its side effect on f;
        # the result list bound to ff is discarded.
        ff = map(test_one, trange(1, nb_labels+1))
        layers.append(f)
    print u'抗腐蝕能力檢測完畢.'
    return layers


def pooling(layers):
    """Merge layers pooling-style: every connected component of the union is
    replaced by the single layer contributing most pixels to it."""
    print u'整合分解的特征中...'
    result = sum(layers)
    label_im, nb_labels = ndimage.label(result, structure=np.ones((3,3)))
    def pool_one(i):
        index = label_im==i
        # Index of the layer with the largest share of this component.
        k = np.argmax([1.0*layers[j][index].sum()/result[index].sum() for j in range(len(layers))])
        result[index] = layers[k][index]
    t = map(pool_one, trange(1, nb_labels+1))  # eager map for side effects
    print u'特征整合成功.'
    return result


def post_do(pic):
    """Post-denoising: remove components that are too small relative to the
    image (density test) or isolated high-density blobs (isolation test).
    Mutates and returns pic."""
    label_im, nb_labels = ndimage.label(pic, structure=np.ones((3,3)))
    print u'圖像的后期去噪中...'
    def post_do_one(i):
        index = label_im==i
        index2 = ndimage.find_objects(index)[0]  # bounding-box slices
        # Scale factor: image area / bbox-area^2, used to normalize thresholds.
        ss = 1.0 * len(pic.reshape(-1))/len(pic[index2].reshape(-1))**2
        # First test low/high density, then test isolation.
        if (index.sum()*ss < 16) or ((1+len(pic[index2].reshape(-1))-index.sum())*ss < 16):
            pic[index] = False
        else:
            a,b,c,d = index2[0].start, index2[0].stop, index2[1].start, index2[1].stop
            # Bounding box inflated to ~3x its size, clipped to the image.
            index3 = (slice(max(0, 2*a-b),min(pic.shape[0], 2*b-a)), slice(max(0, 2*c-d),min(pic.shape[1], 2*d-c)))
            # Isolated (nothing else in the inflated box) and dense (>0.75 fill): drop it.
            if (pic[index3].sum() == index.sum()) and (1.0*index.sum()/(b-a)/(d-c) > 0.75):
                pic[index2] = False
    t = map(post_do_one, trange(1, nb_labels+1))  # eager map for side effects
    print u'后期去噪完成.'
    return pic


def areas(pic):
    """Generate candidate regions: fill each component's bounding box solid."""
    print u'正在生成候選區域...'
    pic_ = pic.copy()
    label_im, nb_labels = ndimage.label(pic_, structure=np.ones((3,3)))
    def areas_one(i):
        index = label_im==i
        index2 = ndimage.find_objects(index)[0]
        pic_[index2] = True  # fill the whole bounding box
    t = map(areas_one, trange(1, nb_labels+1))  # eager map for side effects
    return pic_


# Distance function returning (distance, direction).
# NOTE: distance(o1, o2) and distance(o2, o1) are NOT symmetric.
def distance(o1, o2):
    """o1/o2 are (center, height, width) tuples as produced by integrate().

    Returns (d, k): d is the squared gap between the two boxes (0 if they
    overlap), k encodes the direction of o2 relative to o1:
    1=right, 2=up, 3=left, 4=down (image coordinates).
    """
    delta = np.array(o2[0])-np.array(o1[0])
    # Gap between box edges = center distance minus half-sizes, clamped at 0.
    d = np.abs(delta)-np.array([(o1[1]+o2[1])/2.0, (o1[2]+o2[2])/2.0])
    d = np.sum(((d >= 0)*d)**2)
    theta = np.angle(delta[0]+delta[1]*1j)  # angle of the center-to-center vector
    k = 1
    if np.abs(theta) <= np.pi/4:
        k = 4
    elif np.abs(theta) >= np.pi*3/4:
        k = 2
    elif np.pi/4 < theta < np.pi*3/4:
        k = 1
    else:
        k = 3
    return d, k


def integrate(pic, k=0):
    """Merge neighbouring regions by dilating each box toward its nearest
    neighbour. k=0: dilate in all directions; k=1: horizontal only."""
    label_im, nb_labels = ndimage.label(pic, structure=np.ones((3,3)))
    def integrate_one(i):
        # Compute (center, height, width) for component i.
        index = label_im==i
        index2 = ndimage.find_objects(index)[0]
        a,b,c,d = index2[0].start, index2[0].stop, index2[1].start, index2[1].stop
        cc = ((a+b)/2.0,(c+d)/2.0)
        return (cc, b-a, d-c)
    print u'正在確定區域屬性...'
    A = map(integrate_one, trange(1, nb_labels+1))
    print u'區域屬性已經確定,正在整合鄰近區域...'
    aa,bb = pic.shape
    pic_ = pic.copy()
    def areas_one(i):
        # Find the nearest neighbouring region by sorting distances.
        dist = [distance(A[i-1], A[j-1]) for j in range(1, nb_labels+1) if i != j]
        dist = np.array(dist)
        ext = dist[np.argsort(dist[:,0])[0]]  # nearest region via sort-by-distance
        # Only merge when the gap is small relative to this region's size.
        if ext[0] <= (min(A[i-1][1],A[i-1][2])/4)**2:
            ext = int(ext[1])
            index = label_im==i
            index2 = ndimage.find_objects(index)[0]
            a,b,c,d = index2[0].start, index2[0].stop, index2[1].start, index2[1].stop
            if ext == 1:  # dilate toward the neighbour's direction
                pic_[a:b, c:min(d+(d-c)/4,bb)] = True
            elif ext == 3:
                pic_[a:b, max(c-(d-c)/4,0):d] = True
            elif ext == 4 and k == 0:
                # Horizontal-layout assumption: horizontal dilation (1/4) is
                # larger than vertical dilation (1/6).
                pic_[a:min(b+(b-a)/6,aa), c:d] = True
            elif k == 0:
                pic_[max(a-(b-a)/6,0):b, c:d] = True
    t = map(areas_one, trange(1, nb_labels+1))  # eager map for side effects
    print u'整合完成.'
    return pic_


def cut_blank(pic):
    """Trim surrounding blank margins; return [(row_start,row_stop),(col_start,col_stop)].

    Falls back to [(0,1),(0,1)] for an all-zero image (the np.where
    indexing raises IndexError in that case).
    """
    try:
        q = pic.sum(axis=1)
        ii,jj = np.where(q!= 0)[0][[0,-1]]  # first/last non-empty row
        xi = (ii, jj+1)
        q = pic.sum(axis=0)
        ii,jj = np.where(q!= 0)[0][[0,-1]]  # first/last non-empty column
        yi = (ii, jj+1)
        return [xi, yi]
    except:
        return [(0,1),(0,1)]


def trim(pic, pic_, prange=5):
    """Trim blank margins of every region in pic_ (using content from pic)
    and delete regions smaller than prange in either dimension."""
    label_im, nb_labels = ndimage.label(pic_, structure=np.ones((3,3)))
    def trim_one(i):
        index = label_im==i
        index2 = ndimage.find_objects(index)[0]
        box = (pic*index)[index2]  # actual content restricted to this region
        [(a1,b1), (c1,d1)] = cut_blank(box)
        pic_[index] = False  # clear first, then restore the trimmed box if large enough
        if (b1-a1 < prange) or (d1-c1 < prange) or ((b1-a1)*(d1-c1) < prange**2):
            # Region too small: keep it deleted.
            pass
        else:
            # Restore the region with blank margins removed.
            a,b,c,d = index2[0].start, index2[0].stop, index2[1].start, index2[1].stop
            pic_[a+a1:a+b1,c+c1:c+d1] = True
    t = map(trim_one, trange(1, nb_labels+1))  # eager map for side effects
    return pic_


def bound(m):
    """Return a boolean edge map of m: True where m differs from its
    right/down/diagonal neighbour (first-difference based boundary)."""
    frange = (slice(m.shape[0]-1), slice(m.shape[1]-1))
    f0 = np.abs(np.diff(m, axis=0))     # vertical differences
    f1 = np.abs(np.diff(m, axis=1))     # horizontal differences
    f2 = np.abs(m[frange]-m[1:,1:])     # diagonal differences
    f3 = f0[frange]+f1[frange]+f2[frange] != 0
    return f3


def trim_bound(pic, pic_):
    """Delete regions whose boundary-to-area ratio is below 0.15
    (solid blobs with too little edge, unlikely to be text)."""
    pic_ = pic_.copy()
    label_im, nb_labels = ndimage.label(pic_, structure=np.ones((3,3)))
    def trim_one(i):
        index = label_im==i
        index2 = ndimage.find_objects(index)[0]
        box = pic[index2]
        if 1.0 * bound(box).sum()/box.sum() < 0.15:
            pic_[index] = False
    t = map(trim_one, trange(1, nb_labels+1))  # eager map for side effects
    return pic_
文件2:model_training.py
# -*- coding:utf-8 -*-
# Training script for the per-character CNN recognizer (Python 2, Keras 1.x,
# Theano-style channel-first image ordering). Renders every character of the
# vocabulary with each *.ttf font found in the working directory at several
# sizes, adds salt noise, and trains a 2-conv-layer softmax classifier.

import numpy as np
from PIL import Image, ImageFont, ImageDraw
import pandas as pd
import glob

# Character vocabulary (truncated in this listing; full set has 3062 entries).
hanzi = u'0123456789AaBbCcDdEeFfGgHhIiJjKkLlMmNnOoPpQqRrSsTtUuVvWwXxYyZz的一是不人有了在你我個大中要這為上生時會以就子到來可能和自們年多發心好用家出關長他成天對也小后下學都點國過地行信方得最說二業分作如看女於面注別經動公開現而美么還事'

# Render one character as a size[0]xsize[1] float matrix with ~5% salt noise.
def gen_img(text, size=(48,48), fontname='simhei.ttf', fontsize=48):
    im = Image.new('1', size, 1)  # 1-bit image, white background
    dr = ImageDraw.Draw(im)
    font = ImageFont.truetype(fontname, fontsize)
    dr.text((0, 0), text, font=font)
    # ==0 selects the drawn (black) pixels; OR with random noise mask.
    return (((np.array(im.getdata()).reshape(size)==0)+(np.random.random(size)<0.05)) != 0).astype(float)

# Build the training set: every font x 5 font sizes x 2 noisy renderings.
data = pd.DataFrame()
fonts = glob.glob('./*.[tT][tT]*')  # all TrueType-ish font files in cwd
for fontname in fonts:
    print fontname
    for i in range(-2,3):  # font sizes 46..50
        m = pd.DataFrame(pd.Series(list(hanzi)).apply(lambda s:[gen_img(s, fontname=fontname, fontsize=48+i)]))
        m['label'] = range(3062)
        data = data.append(m, ignore_index=True)
        # Intentionally repeated: a second independent noisy rendering per
        # (font, size) pair — matches the *2 factor in y below.
        m = pd.DataFrame(pd.Series(list(hanzi)).apply(lambda s:[gen_img(s, fontname=fontname, fontsize=48+i)]))
        m['label'] = range(3062)
        data = data.append(m, ignore_index=True)

# Each cell holds [img], so the array shape is (N, 1, 48, 48) — channel first.
x = np.array(list(data[0])).astype(float)
np.save('x', x)  # persist training data

dic=dict(zip(range(3062),list(hanzi)))  # index -> character table

from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.convolutional import Convolution2D, MaxPooling2D
from keras.utils import np_utils

batch_size = 1024
nb_classes = 3062
nb_epoch = 30
img_rows, img_cols = 48, 48
# number of convolutional filters to use
nb_filters = 64
# size of pooling area for max pooling
nb_pool = 2
# convolution kernel size
nb_conv = 4

x = np.load('x.npy')
# Labels repeat the vocabulary once per sample group; 45*5*2 presumably means
# 45 fonts x 5 sizes x 2 renderings — TODO confirm it matches len(x).
y = np_utils.to_categorical(range(3062)*45*5*2, nb_classes)
# Class weights favouring high-frequency characters (vocabulary is assumed
# frequency-ordered). NOTE(review): range(3063) has one extra key; zip
# truncates to 3062 so the surplus is harmless.
weight = ((3062-np.arange(3062))/3062.0+1)**3
weight = dict(zip(range(3063),weight/weight.mean()))

# CNN: conv(4x4,64) -> relu -> maxpool -> dropout, twice; then FC 1024 -> softmax.
model = Sequential()
model.add(Convolution2D(nb_filters, nb_conv, nb_conv,
                        border_mode='valid',
                        input_shape=(1, img_rows, img_cols)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(nb_pool, nb_pool)))
model.add(Dropout(0.25))
model.add(Convolution2D(nb_filters, nb_conv, nb_conv))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(nb_pool, nb_pool)))
model.add(Dropout(0.25))
model.add(Flatten())
model.add(Dense(1024))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(nb_classes))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

history = model.fit(x, y, batch_size=batch_size, nb_epoch=nb_epoch, class_weight=weight)

# NOTE(review): evaluated on the training set itself — this is a training
# accuracy, not a held-out test score.
score = model.evaluate(x,y)
print('Test score:', score[0])
print('Test accuracy:', score[1])
model.save_weights('model.model')
文件3:ocr.py
# -*- coding:utf-8 -*-
# Recognition module (Python 2, Keras 1.x): character segmentation, CNN
# single-character recognition, and a Viterbi pass over transition
# probabilities to pick the most plausible character sequence.

import numpy as np
from scipy import misc
from images import cut_blank

# Character vocabulary (truncated in this listing; full set has 3062 entries).
hanzi = u'0123456789AaBbCcDdEeFfGgHhIiJjKkLlMmNnOoPpQqRrSsTtUuVvWwXxYyZz的一是不人有了在你我個大中要這為上生時會以就子到來可能和自們年多發心好用家出關長他成天對也小后下學都點國過地行信方得最說二業分作如看女於面注別經動公開現而美么還事'
dic=dict(zip(range(3062),list(hanzi)))  # index -> character table

from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.convolutional import Convolution2D, MaxPooling2D
from keras.utils import np_utils

batch_size = 128
nb_classes = 3062
img_rows, img_cols = 48, 48
nb_filters = 64
nb_pool = 2
nb_conv = 4

# Architecture must match model_training.py exactly so the saved weights load.
model = Sequential()
model.add(Convolution2D(nb_filters, nb_conv, nb_conv,
                        border_mode='valid',
                        input_shape=(1, img_rows, img_cols)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(nb_pool, nb_pool)))
model.add(Dropout(0.25))
model.add(Convolution2D(nb_filters, nb_conv, nb_conv))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(nb_pool, nb_pool)))
model.add(Dropout(0.25))
model.add(Flatten())
model.add(Dense(1024))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(nb_classes))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.load_weights('ocr.model')

import pandas as pd
# Transition-probability table: index is a 2-character string "ab",
# value is P(b follows a); 'XX' rows act as the unseen-pair fallback.
zy = pd.read_csv('zhuanyi.csv', encoding='utf-8', header=None)
zy.set_index(0, inplace=True)
zy = zy[1]

def viterbi(nodes):
    """Pick the best character sequence.

    nodes is a list of {char: probability} dicts, one per segmented glyph
    (as produced by ocr_one(mode='search')). Combines emission scores with
    the zy transition table, keeping the best path per current character.
    Returns the winning string.
    """
    paths = nodes[0]
    for l in range(1,len(nodes)):
        paths_ = paths.copy()
        paths = {}
        for i in nodes[l].keys():
            nows = {}
            for j in paths_.keys():
                try:
                    nows[j+i]= paths_[j]*nodes[l][i]*zy[j[-1]+i]
                except KeyError:
                    # Unseen character pair: fall back to the 'XX' smoothing row.
                    nows[j+i]= paths_[j]*nodes[l][i]*zy[j[-1]+'XX']
            # Keep only the best-scoring path ending in character i.
            k = np.argmax(nows.values())
            paths[nows.keys()[k]] = nows.values()[k]
    return paths.keys()[np.argmax(paths.values())]

# mode is 'direact' (sic, kept for interface compatibility) or 'search':
# the former returns the recognized character directly, the latter returns
# the top-3 characters with probabilities (for the Viterbi pass).
def ocr_one(m, mode='direact'):
    """Recognize a single glyph image m (2-D array).

    Normalizes to the 48x48 input the CNN was trained on: trim blanks,
    pad to square, resize to 46x46 and center in a 48x48 frame.
    """
    m = m[[slice(*i) for i in cut_blank(m)]]
    # Pad the short axis so the glyph sits in a square canvas.
    if m.shape[0] >= m.shape[1]:
        p = np.zeros((m.shape[0],m.shape[0]))
        p[:,:m.shape[1]] = m
    else:
        p = np.zeros((m.shape[1],m.shape[1]))
        p[:m.shape[0],:] = m
    # Normalize to 48x48: resize to 46x46 and add a 1-pixel border.
    m = misc.imresize(p,(46,46), interp='nearest')
    p = np.zeros((48, 48))
    p[1:47,1:47] = m
    m = p
    m = 1.0 * m / m.max()  # scale to [0,1]
    k = model.predict(np.array([[m]]), verbose=0)[0]  # class probabilities
    ks = k.argsort()  # ascending; best class is ks[-1]
    if mode == 'direact':
        # Only accept a confident prediction; otherwise return empty string.
        if k[ks[-1]] > 0.5:
            return dic[ks[-1]]
        else:
            return ''
    elif mode == 'search':
        # Top-3 candidates with their probabilities, for viterbi().
        return {dic[ks[-1]]:k[ks[-1]],dic[ks[-2]]:k[ks[-2]],dic[ks[-3]]:k[ks[-3]]}

'''
#直接調用Tesseract
import os
def ocr_one(m):
    misc.imsave('tmp.png', m)
    os.system('tesseract tmp.png tmp -l chi_sim -psm 10')
    s = open('tmp.txt').read()
    os.system('rm tmp.txt \n rm tmp.png')
    return s.strip()
'''

def cut_line(pl):
    """Segment one text-line image into character cells.

    Returns (trimmed line image, sorted list of cut x-positions). Gaps of
    blank columns are collapsed to their center; a cut is kept when the
    span between surrounding gaps exceeds 1.2x the line height, then wide
    cells are subdivided evenly assuming roughly square characters.
    """
    pl = pl[[slice(*i) for i in cut_blank(pl)]]
    pl0 = pl.sum(axis=0)
    pl0 = np.where(pl0==0)[0]  # indices of fully blank columns
    if len(pl0) > 0:
        # Collapse each run of consecutive blank columns into its mean position.
        pl1=[pl0[0]]
        t=[pl0[0]]
        for i in pl0[1:]:
            if i-pl1[-1] == 1:
                t.append(i)
                pl1[-1]=i
            else:
                pl1[-1] = sum(t)/len(t)
                t = [i]
                pl1.append(i)
        pl1[-1] = sum(t)/len(t)
        pl1 = [0] + pl1 + [pl.shape[1]-1]
        # Keep a cut when the neighbouring span is wide relative to line height.
        cut_position = [1.0*(pl1[i+1]-pl1[i-1])/pl.shape[0] > 1.2 for i in range(1,len(pl1)-1)]
        cut_position=[pl1[1:-1][i] for i in range(len(pl1)-2) if cut_position[i]]  # simple cut rule
        cut_position = [0] + cut_position + [pl.shape[1]-1]
    else:
        cut_position = [0, pl.shape[1]-1]
    # Subdivide wide cells into j roughly-square sub-cells.
    l = len(cut_position)
    for i in range(1, l):
        j = int(round(1.0*(cut_position[i]-cut_position[i-1])/pl.shape[0]))
        ab = (cut_position[i]-cut_position[i-1])/max(j,1)
        cut_position = cut_position + [k*ab+cut_position[i-1] for k in range(1, j)]
    cut_position.sort()
    return pl, cut_position

def ocr_line(pl, mode='viterbi'):
    """Recognize one text line. mode is 'viterbi' (dynamic-programming
    decode over top-3 candidates) or 'direact' (greedy per-glyph).
    Returns the recognized string."""
    pl, cut_position = cut_line(pl)
    if mode == 'viterbi':
        text = map(lambda i: ocr_one(pl[:,cut_position[i]:cut_position[i+1]+1], mode='search'), range(len(cut_position)-1))
        return viterbi(text)
    elif mode == 'direact':
        text = map(lambda i: ocr_one(pl[:,cut_position[i]:cut_position[i+1]+1]), range(len(cut_position)-1))
        # BUG FIX: the original computed the join but never returned it,
        # so 'direact' mode always returned None.
        return ''.join(text)
文件4:main.py
# -*- coding:utf-8 -*- from scipy import ndimage print u'加載圖片工具中...' from images import * print u'加載OCR模型中...' from ocr import * print u'加載完畢.' if __name__ == '__main__': filename = '../cn.jpg' p = myread(filename) dc = decompose(p) layers = erosion_test(dc) result = pooling(layers) result = post_do(result) result_ = areas(result) result_ = integrate(result_, 1) result_ = trim(result, result_) result_ = integrate(result_, 1) result_ = trim(result, result_, 10) result_ = trim_bound(result, result_) label_im, nb_labels = ndimage.label(result_, structure=np.ones((3,3))) for i in range(1, nb_labels+1): index = label_im==i index2 = ndimage.find_objects(index)[0] print ocr_line(result[index2])