什麼是OCR識別

OCR（Optical Character Recognition，光學字符識別）是指對文本資料進行掃描後，對得到的圖像文件進行分析處理，從中獲取文字及版面信息的過程。用OpenCV進行OCR識別時，通常分為兩步：掃描、識別。

如何進行OCR識別

整體流程

1.讀取圖像
2.預處理（灰度化、高斯濾波去噪）
3.邊緣檢測
4.輪廓檢測
5.輪廓近似
6.透視變換
7.OCR識別
8.展示結果

具體實現

Step1:邊緣檢測

1.讀取圖像

import cv2
import numpy as np
# Read the input image. cv2.imread returns None (it does not raise) when the
# file is missing or cannot be decoded, so fail early with a clear message
# instead of an AttributeError on image.shape below.
image = cv2.imread('receipt.jpg')
if image is None:
    raise OSError("could not read image 'receipt.jpg'")
# Contours are located on the resized copy; keep the scale factor so their
# coordinates can be mapped back onto the full-resolution original.
ratio = image.shape[0] / 500.0
orig = image.copy()
image = resize(orig, height = 500)

先將輸入圖像等比例縮小為高度500像素的圖像。之後得到的關鍵點坐標都是基於縮小後的圖像的，因此需要先記錄原圖高度與500的比值（稱為ratio），以便稍後把這些坐標換算回原圖。
同時，為了方便，在這里寫了一個函數進行resize操作，如下：

def resize(image, width=None, height=None, inter=cv2.INTER_AREA):
    """Resize ``image`` to a target width or height, keeping the aspect ratio.

    Args:
        image: Array whose first two ``shape`` entries are (height, width).
        width: Target width in pixels. When given it determines the scale
            and ``height`` is not used.
        height: Target height in pixels, used only when ``width`` is None.
        inter: Interpolation flag forwarded to ``cv2.resize``.

    Returns:
        ``image`` itself when neither dimension is given, otherwise the
        result of ``cv2.resize``.
    """
    src_h, src_w = image.shape[:2]

    # Nothing requested: hand the input back untouched.
    if width is None and height is None:
        return image

    if width is not None:
        # Scale driven by the requested width.
        scale = width / float(src_w)
        target = (width, int(src_h * scale))
    else:
        # Scale driven by the requested height.
        scale = height / float(src_h)
        target = (int(src_w * scale), height)

    # cv2.resize expects the size as (width, height).
    return cv2.resize(image, target, interpolation=inter)

2.預處理

# Preprocessing
# Grayscale conversion. cv2.imread yields BGR channel order, so the BGR
# conversion code is the one that applies the correct channel weights.
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
# Gaussian blur to suppress noise before edge detection
gray = cv2.GaussianBlur(gray, (5, 5), 0)

# Edge detection
edged = cv2.Canny(gray, 75, 200)

# imshow must be reached through the cv2 namespace; the bare name is
# never imported in this script.
cv2.imshow("canny", edged)

Step2:獲取輪廓

# Contour detection -- the document outline is expected among the contours
# with the largest area.
# findContours returns (contours, hierarchy) in OpenCV 2.x/4.x but
# (image, contours, hierarchy) in 3.x; indexing from the end works for both.
found = cv2.findContours(edged.copy(), cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
cnts, hierarchy = found[-2], found[-1]
cnts = sorted(cnts, key=cv2.contourArea, reverse=True)[:5]

# Walk the five largest contours, biggest first
screenCnt = None
for c in cnts:
    # Polygon approximation with a tolerance of 2% of the perimeter
    peri = cv2.arcLength(c, True)
    approx = cv2.approxPolyDP(c, 0.02 * peri, True)

    # The first approximation with exactly four vertices is taken
    if len(approx) == 4:
        screenCnt = approx
        break

# Without this guard a missing quadrilateral surfaces as a NameError below.
if screenCnt is None:
    raise RuntimeError("no four-point contour found in the image")

# Show the result
print("STEP2:獲取輪廓")
cv2.drawContours(image, [screenCnt], -1, (0, 0, 255), 2, cv2.LINE_AA)
cv2.imshow("outline", image)

Step3:透視變換

透視變換的基本原理

首先引入兩個函數完成透視變換的操作。
（1）order_points(pts)函數：將上一步得到的輪廓的四個頂點按照左上，右上，右下，左下的順序排序。
其原理為：首先計算每個點兩個坐標的和（x+y），和最小的為左上角的點，和最大的為右下角的點；然後計算每個點兩個坐標的差（y−x，即np.diff沿axis=1的結果），差最小的為右上角的點，差最大的為左下角的點。

def order_points(pts):
    """Order four corner points as top-left, top-right, bottom-right, bottom-left.

    The points are sorted by their angle around the centroid, which walks the
    quadrilateral clockwise in image coordinates (y grows downwards), and the
    walk is then rotated to start at the point with the smallest ``x + y``.
    For a convex quadrilateral this yields the same order as the classic
    "min/max of x+y, min/max of y-x" rule whenever that rule has no ties, but
    it never returns the same corner twice when sums or differences tie
    (for example a rectangle rotated by 45 degrees).

    Args:
        pts: Four (x, y) points; anything that reshapes to (4, 2), such as a
            4x2 array or a 4x1x2 contour.

    Returns:
        A float32 array of shape (4, 2) in the order described above.
    """
    pts = np.asarray(pts, dtype="float32").reshape(4, 2)

    # Angle of every point as seen from the centroid; ascending angle means
    # clockwise on screen because the image y axis points down.
    center = pts.mean(axis=0)
    angles = np.arctan2(pts[:, 1] - center[1], pts[:, 0] - center[0])
    ring = pts[np.argsort(angles, kind="stable")]

    # Start the walk at the top-left corner (smallest x + y).
    start = int(np.argmin(ring.sum(axis=1)))
    return np.roll(ring, -start, axis=0)

（2）four_point_transform(image, pts)：透視變換。

def four_point_transform(image, pts):
    """Warp the quadrilateral ``pts`` of ``image`` onto an upright rectangle.

    Args:
        image: Source image passed to ``cv2.warpPerspective``.
        pts: Four corner points, handed to ``order_points`` for ordering.

    Returns:
        The warped image; its size is the longer of the two opposite edge
        lengths in each direction, truncated to integers.
    """
    def _edge(p, q):
        # Euclidean length of the edge between two corners
        return np.sqrt(((p[0] - q[0]) ** 2) + ((p[1] - q[1]) ** 2))

    # Corners arrive ordered top-left, top-right, bottom-right, bottom-left.
    corners = order_points(pts)
    top_left, top_right, bottom_right, bottom_left = corners

    # Output width: longer of the bottom and top edges.
    out_w = max(int(_edge(bottom_right, bottom_left)),
                int(_edge(top_right, top_left)))
    # Output height: longer of the right and left edges.
    out_h = max(int(_edge(top_right, bottom_right)),
                int(_edge(top_left, bottom_left)))

    # Destination corners in the same order as the source corners; the last
    # valid pixel index is size - 1.
    target = np.array(
        [[0, 0],
         [out_w - 1, 0],
         [out_w - 1, out_h - 1],
         [0, out_h - 1]],
        dtype="float32",
    )

    # Homography from the source quadrilateral to the rectangle, then warp.
    matrix = cv2.getPerspectiveTransform(corners, target)
    return cv2.warpPerspective(image, matrix, (out_w, out_h))

# Perspective transform. The outline was found on the resized image, so its
# corners are multiplied by ratio to address the full-resolution original.
corners = screenCnt.reshape(4, 2) * ratio
warped = four_point_transform(orig, corners)
# Grayscale conversion followed by a fixed binary threshold at 100
warped = cv2.cvtColor(warped, cv2.COLOR_BGR2GRAY)
_, ref = cv2.threshold(warped, 100, 255, cv2.THRESH_BINARY)
cv2.imwrite('scan.jpg', ref)
# Show the result and wait for a key press
print("STEP 3: 變換")
cv2.imshow("Scanned", ref)
cv2.waitKey(0)

變換后的結果：

Step4:OCR識別

1.下載
https://digi.bib.uni-mannheim.de/tesseract/
選擇一個版本進行下載

2.安裝
下載完成後執行安裝程序，一路點擊Next即可完成安裝。

3.環境變量配置
將剛剛安裝的目錄添加到環境變量中
（可以在命令行窗口（cmd）中輸入tesseract -v進行測試，會輸出版本號。）

4.OCR識別測試
在命令行窗口中輸入tesseract scan.jpg result，會將剛才掃描的圖片上的信息寫入result.txt文件中。

5.在python中實現

5.1 先安裝pytesseract---pip install pytesseract
5.2 在庫文件夾中找到pytesseract文件夾裡的pytesseract.py文件，打開後將其中tesseract_cmd一行的值改為tesseract可執行文件的絕對路徑。（也可以不修改庫文件，直接在自己的腳本中設置 pytesseract.pytesseract.tesseract_cmd = r'絕對路徑'。）

6.在python中引入相關的庫

from PIL import Image
import pytesseract
import cv2
import os

7.執行識別操作

# Preprocessing applied before OCR: 'blur' (median blur) or 'thresh' (Otsu)
preprocess = 'blur'

# cv2.imread returns None instead of raising when the file cannot be read.
image = cv2.imread('scan.jpg')
if image is None:
    raise OSError("could not read image 'scan.jpg'")
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

if preprocess == "thresh":
    # Otsu picks the threshold automatically; the 0 passed here is ignored
    gray = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]
elif preprocess == "blur":
    # Median blur with a 3x3 aperture
    gray = cv2.medianBlur(gray, 3)

# Hand the image to Tesseract through a temporary file named after the PID.
filename = "{}.png".format(os.getpid())
if not cv2.imwrite(filename, gray):
    raise OSError("could not write temporary image {}".format(filename))

try:
    # Close the PIL handle before deleting; an open handle blocks os.remove
    # on Windows.
    with Image.open(filename) as img:
        text = pytesseract.image_to_string(img)
finally:
    # Remove the temporary file even when OCR raises.
    os.remove(filename)

print(text)

8.最終效果

完整代碼

#ocr_demo.py   輸入參數:--image D:\image\paper.jpg
import cv2
import argparse
import numpy as np
import imutils

# Command-line interface: a single required option naming the input image.
cli_parser = argparse.ArgumentParser()
cli_parser.add_argument(
    "-i", "--image",
    required=True,
    help="path to input image",
)
# Expose the parsed namespace as a plain dict keyed by option name.
parsed = cli_parser.parse_args()
args = vars(parsed)

def resize(image, width=None, height=None, inter=cv2.INTER_AREA):
    """Scale ``image`` to the given width or height, preserving aspect ratio.

    Args:
        image: Array whose first two ``shape`` entries are (height, width).
        width: Desired width in pixels; takes precedence over ``height``.
        height: Desired height in pixels, consulted only if ``width`` is None.
        inter: Interpolation flag forwarded to ``cv2.resize``.

    Returns:
        The unchanged ``image`` when both ``width`` and ``height`` are None,
        otherwise the resized image from ``cv2.resize``.
    """
    rows, cols = image.shape[:2]

    if width is None and height is None:
        # No target size requested.
        return image

    if width is not None:
        factor = width / float(cols)
        new_size = (width, int(rows * factor))
    else:
        factor = height / float(rows)
        new_size = (int(cols * factor), height)

    # new_size is (width, height), the order cv2.resize expects.
    return cv2.resize(image, new_size, interpolation=inter)

def order_point(pts):
    """Order four corner points as top-left, top-right, bottom-right, bottom-left.

    The points are sorted by their angle around the centroid, which walks the
    quadrilateral clockwise in image coordinates (y grows downwards), and the
    walk is then rotated to start at the point with the smallest ``x + y``.
    For a convex quadrilateral this yields the same order as the classic
    "min/max of x+y, min/max of y-x" rule whenever that rule has no ties, but
    it never returns the same corner twice when sums or differences tie
    (for example a rectangle rotated by 45 degrees).

    Args:
        pts: Four (x, y) points; anything that reshapes to (4, 2), such as a
            4x2 array or a 4x1x2 contour.

    Returns:
        A float32 array of shape (4, 2) in the order described above.
    """
    pts = np.asarray(pts, dtype="float32").reshape(4, 2)

    # Angle of every point as seen from the centroid; ascending angle means
    # clockwise on screen because the image y axis points down.
    center = pts.mean(axis=0)
    angles = np.arctan2(pts[:, 1] - center[1], pts[:, 0] - center[0])
    ring = pts[np.argsort(angles, kind="stable")]

    # Start the walk at the top-left corner (smallest x + y).
    start = int(np.argmin(ring.sum(axis=1)))
    return np.roll(ring, -start, axis=0)

def four_point_transform(image, pts):
    """Warp the quadrilateral ``pts`` of ``image`` onto an upright rectangle.

    Args:
        image: Source image passed to ``cv2.warpPerspective``.
        pts: Four corner points, handed to ``order_point`` for ordering.

    Returns:
        The warped image; its size is the longer of the two opposite edge
        lengths in each direction, truncated to integers.
    """
    def _length(a, b):
        # Euclidean distance: square root of dx squared plus dy squared
        return np.sqrt(((a[0] - b[0]) ** 2) + ((a[1] - b[1]) ** 2))

    # Corners arrive ordered top-left, top-right, bottom-right, bottom-left.
    quad = order_point(pts)
    top_left, top_right, bottom_right, bottom_left = quad

    # Width: the longer of the bottom and top edges.
    bottom_edge = _length(bottom_right, bottom_left)
    top_edge = _length(top_right, top_left)
    out_w = max(int(bottom_edge), int(top_edge))

    # Height: the longer of the right and left edges.
    right_edge = _length(top_right, bottom_right)
    left_edge = _length(top_left, bottom_left)
    out_h = max(int(right_edge), int(left_edge))

    # Destination rectangle, corners in the same order as the source quad.
    target = np.array(
        [[0, 0],
         [out_w - 1, 0],
         [out_w - 1, out_h - 1],
         [0, out_h - 1]],
        dtype="float32",
    )

    # 3x3 perspective matrix mapping the quad onto the rectangle, then warp.
    matrix = cv2.getPerspectiveTransform(quad, target)
    return cv2.warpPerspective(image, matrix, (out_w, out_h))

# Read the image named on the command line. cv2.imread returns None instead
# of raising when the file is missing or cannot be decoded.
image = cv2.imread(args["image"])
if image is None:
    raise OSError("could not read image: {}".format(args["image"]))

# Contours are detected on a copy resized to height 500; ratio maps their
# coordinates back onto the full-resolution original.
ratio = image.shape[0] / 500.0
orig = image.copy()
image = resize(orig, height=500)

# Preprocessing: grayscale (imread yields BGR order, hence BGR2GRAY) followed
# by a Gaussian blur to suppress noise.
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
gray = cv2.GaussianBlur(gray, (5, 5), 0)

# Edge detection
edged = cv2.Canny(gray, 75, 200)

# imshow / waitKey / LINE_AA are reached through the cv2 namespace; this
# script never imports them as bare names.
cv2.imshow("canny", edged)

# Contour detection. findContours returns (contours, hierarchy) in OpenCV
# 2.x/4.x but (image, contours, hierarchy) in 3.x; indexing from the end
# works for both.
found = cv2.findContours(edged.copy(), cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
cnts, hierarchy = found[-2], found[-1]
cnts = sorted(cnts, key=cv2.contourArea, reverse=True)[:5]

# Walk the five largest contours, biggest first
screenCnt = None
for c in cnts:
    # Polygon approximation with a tolerance of 2% of the perimeter
    peri = cv2.arcLength(c, True)
    approx = cv2.approxPolyDP(c, 0.02 * peri, True)

    # The first approximation with exactly four vertices is taken
    if len(approx) == 4:
        screenCnt = approx
        break

# Without this guard a missing quadrilateral surfaces as a NameError below.
if screenCnt is None:
    raise RuntimeError("no four-point contour found in the image")

# Show the detected outline
print("STEP2:獲取輪廓")
cv2.drawContours(image, [screenCnt], -1, (0, 0, 255), 2, cv2.LINE_AA)
cv2.imshow("outline", image)

# Perspective transform on the original, with the corners scaled by ratio
wraped = four_point_transform(orig, screenCnt.reshape(4, 2) * ratio)

# Grayscale conversion followed by a fixed binary threshold at 100
wraped = cv2.cvtColor(wraped, cv2.COLOR_BGR2GRAY)
ref = cv2.threshold(wraped, 100, 255, cv2.THRESH_BINARY)[1]
cv2.imwrite("scan.jpg", ref)

# Show the results and wait for a key press
print("STEP3:變換")
cv2.imshow("Original", resize(orig, width=500))
cv2.imshow("Scanned", ref)
cv2.waitKey()

#ocr.py
from PIL import Image
import pytesseract
import cv2
import os
from cv2 import waitKey

# Preprocessing applied before OCR: 'blur' (median blur) or 'thresh' (Otsu)
preprocess = 'blur'

# cv2.imread returns None instead of raising when the file cannot be read.
image = cv2.imread("scan.jpg")
if image is None:
    raise OSError("could not read image 'scan.jpg'")
# imread yields BGR channel order, hence BGR2GRAY.
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

if preprocess == "thresh":
    # Otsu picks the threshold automatically; the 0 passed here is ignored
    gray = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]
elif preprocess == "blur":
    # Median blur with a 3x3 aperture
    gray = cv2.medianBlur(gray, 3)

# Hand the image to Tesseract through a temporary file named after the PID.
filename = "{}.png".format(os.getpid())
if not cv2.imwrite(filename, gray):
    raise OSError("could not write temporary image {}".format(filename))

try:
    # Close the PIL handle before deleting; an open handle blocks os.remove
    # on Windows.
    with Image.open(filename) as img:
        text = pytesseract.image_to_string(img)
finally:
    # Remove the temporary file even when OCR raises.
    os.remove(filename)

print(text)

# Show the input and the preprocessed image, then wait for a key press
cv2.imshow("Image", image)
cv2.imshow("output", gray)

waitKey()

免責聲明！

本站轉載的文章僅供個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私或權益，請聯繫本站郵箱yoyou2525@163.com刪除。

猜您在找 OpenCV--文檔掃描OCR識別自己搭建傳統ocr識別項目學習【OCR技術系列之四】基於深度學習的文字識別【實戰】基於OpenCV的水表字符識別（OCR） OCR識別 OCR識別 OpenCV學習(38) 人臉識別(3) OpenCV學習(36) 人臉識別(1) python3學習--安裝OCR識別庫tesserocr 《深度實踐OCR 基於深度學習的文字識別》筆記