1、實現邏輯
package com.vue.demo.service.serviceimpl; import com.vue.demo.service.OCRService; import net.sourceforge.tess4j.Tesseract; import net.sourceforge.tess4j.TesseractException; import net.sourceforge.tess4j.util.ImageHelper; import org.apache.commons.io.FileUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.springframework.stereotype.Service; import org.springframework.web.multipart.MultipartFile; import javax.imageio.ImageIO; import java.awt.image.BufferedImage; import java.io.File; import java.io.IOException; /** * @author yangwj * @date 2020/4/1 9:29 */ @Service public class OCRServiceImpl implements OCRService { private static final Logger ocrServiceImplLog = LoggerFactory.getLogger(OCRServiceImpl.class); String language = ""; /** * 方法一 * @param file * @return */ @Override public String getCharacterFromPic(MultipartFile file) { // String modelPath = "D:\\software\\ocr-tesseract\\tessdata"; String modelPath = "/root/project/java/tesseract_model"; Tesseract tessreact = new Tesseract(); //需要指定訓練集 訓練集到 https://github.com/tesseract-ocr/tessdata 下載。 tessreact.setDatapath(modelPath); if(language.equals("ch")) { //注意 默認是英文識別,如果做中文識別,需要單獨設置。 tessreact.setLanguage("chi_sim"); } try { File imageFile = new File(file.getOriginalFilename()); FileUtils.copyInputStreamToFile(file.getInputStream(), imageFile); String result = tessreact.doOCR(imageFile); ocrServiceImplLog.info(result); System.out.println("----------------"); String handleResult = this.ocr(imageFile,modelPath); ocrServiceImplLog.info(handleResult); return result+"----------------------------------\n\r"+handleResult; } catch (TesseractException e) { System.err.println(e.getMessage()); } catch (IOException e) { e.printStackTrace(); } return null; } @Override public String getLanguage(String language) { if(language == null || language == "" ) { return null; } this.language = language; return "success"; } /** * 方法二 * @param file * @param modelPath * @return */ private String ocr(File file,String 
modelPath) { String result = null; try { double start = System.currentTimeMillis(); BufferedImage textImage = ImageIO.read(file); // 這里對圖片黑白處理,增強識別率.這里先通過截圖,截取圖片中需要識別的部分 textImage = ImageHelper.convertImageToGrayscale(textImage); // 圖片銳化 textImage = ImageHelper.convertImageToBinary(textImage); // 圖片放大倍數,增強識別率(很多圖片本身無法識別,放大5倍時就可以輕易識,但是考濾到客戶電腦配置低,針式打印機打印不連貫的問題,這里就放大5倍) textImage = ImageHelper.getScaledInstance(textImage, textImage.getWidth() * 1, textImage.getHeight() * 1); textImage = ImageHelper.convertImageToBinary(textImage); String saveImgPath = "/root/project/java/tesseract_model/temp_img"; // String saveImgPath = "D:\\software\\ocr-tesseract\\img_tem\\temp.img"; ImageIO.write(textImage, "png", new File(saveImgPath)); Tesseract instance = new Tesseract(); //設置訓練庫的位置 // String modelPath = "/root/project/java/tesseract_model"; instance.setDatapath(modelPath); //中文識別 instance.setLanguage("chi_sim"); result = instance.doOCR(textImage); double end = System.currentTimeMillis(); System.out.println("耗時" + (end - start) / 1000 + " s"); } catch (Exception e) { e.printStackTrace(); } return result; } }
2、部署到 CentOS 時遇到的問題,可以參考這篇文章