對於類似以下簡單的驗證碼的識別方案:
1、
2、
3、
4、
1、建庫:切割驗證碼為單個字符,人工標記,比如:A。
2、識別:給一個驗證碼:切割為單個字符,在庫中查詢識別。
/*** * author:chzeze * 識別驗證碼並返回 * train_path 驗證碼字母圖庫位置 * 驗證碼圖片緩存位置:Configuration.getProperties("web_save_path")+"/captcha.jpg" */ public class AmGetCaptchaTest { private static Logger logger = Logger.getLogger(AmGetCaptchaTest.class); private static String train_path = "/data/sata/share_sata/AmazonCrawl/amazonWeb/captcha"; private static Map<BufferedImage, String> trainMap = null; private static int index = 0; private static int imgnum = 0; private static MultiThreadedHttpConnectionManager httpConnectionManager = new MultiThreadedHttpConnectionManager(); private static HttpClient client = new HttpClient(httpConnectionManager); /* static { //每主機最大連接數和總共最大連接數,通過hosfConfiguration設置host來區分每個主機 client.getHttpConnectionManager().getParams().setDefaultMaxConnectionsPerHost(8); client.getHttpConnectionManager().getParams().setMaxTotalConnections(48); client.getHttpConnectionManager().getParams().setConnectionTimeout(10000); client.getHttpConnectionManager().getParams().setSoTimeout(10000); client.getHttpConnectionManager().getParams().setTcpNoDelay(true); client.getHttpConnectionManager().getParams().setLinger(1000); //失敗的情況下會進行3次嘗試,成功之后不會再嘗試 client.getHttpConnectionManager().getParams().setParameter(HttpMethodParams.RETRY_HANDLER, new DefaultHttpMethodRetryHandler()); }*/ public static int isBlack(int colorInt) { Color color = new Color(colorInt); if (color.getRed() + color.getGreen() + color.getBlue() <= 100) { return 1; } return 0; } public static int isWhite(int colorInt) { Color color = new Color(colorInt); if (color.getRed() + color.getGreen() + color.getBlue() > 600) { return 1; } return 0; } public static BufferedImage removeBackgroud(String picFile) throws Exception { BufferedImage img = ImageIO.read(new File(picFile)); img = img.getSubimage(1, 1, img.getWidth() - 2, img.getHeight() - 2); int width = img.getWidth(); int height = img.getHeight(); double subWidth = width / 5.0; for (int i = 0; i < 5; i++) { Map<Integer, Integer> map = new HashMap<Integer, 
Integer>(); for (int x = (int) (1 + i * subWidth); x < (i + 1) * subWidth && x < width - 1; ++x) { for (int y = 0; y < height; ++y) { if (isWhite(img.getRGB(x, y)) == 1) continue; if (map.containsKey(img.getRGB(x, y))) { map.put(img.getRGB(x, y), map.get(img.getRGB(x, y)) + 1); } else { map.put(img.getRGB(x, y), 1); } } } int max = 0; int colorMax = 0; for (Integer color : map.keySet()) { if (max < map.get(color)) { max = map.get(color); colorMax = color; } } for (int x = (int) (1 + i * subWidth); x < (i + 1) * subWidth && x < width - 1; ++x) { for (int y = 0; y < height; ++y) { if (img.getRGB(x, y) != colorMax) { img.setRGB(x, y, Color.WHITE.getRGB()); } else { img.setRGB(x, y, Color.BLACK.getRGB()); } } } } return img; } public static BufferedImage removeBlank(BufferedImage img) throws Exception { int width = img.getWidth(); int height = img.getHeight(); int start = 0; int end = 0; Label1: for (int y = 0; y < height; ++y) { for (int x = 0; x < width; ++x) { if (isBlack(img.getRGB(x, y)) == 1) { start = y; break Label1; } } } Label2: for (int y = height - 1; y >= 0; --y) { for (int x = 0; x < width; ++x) { if (isBlack(img.getRGB(x, y)) == 1) { end = y; break Label2; } } } return img.getSubimage(0, start, width, end - start + 1); } public static List<BufferedImage> splitImage(BufferedImage img) throws Exception { List<BufferedImage> subImgs = new ArrayList<BufferedImage>(); int width = img.getWidth(); int height = img.getHeight(); List<Integer> weightlist = new ArrayList<Integer>(); for (int x = 0; x < width; ++x) { int count = 0; for (int y = 0; y < height; ++y) { if (isBlack(img.getRGB(x, y)) == 1) { count++; } } weightlist.add(count); } for (int i = 0; i < weightlist.size();i++) { int length = 0; while (i < weightlist.size() && weightlist.get(i) > 0) { i++; length++; } if (length > 2) { subImgs.add(removeBlank(img.getSubimage(i - length, 0, length, height))); } } return subImgs; } public static Map<BufferedImage, String> loadTrainData() throws Exception { if 
(trainMap == null) { Map<BufferedImage, String> map = new HashMap<BufferedImage, String>(); File dir = new File(train_path); File[] files = dir.listFiles(); for (File file : files) { map.put(ImageIO.read(file), file.getName().charAt(0) + ""); } trainMap = map; } return trainMap; } public static String getSingleCharOcr(BufferedImage img, Map<BufferedImage, String> map) { String result = "#"; int width = img.getWidth(); int height = img.getHeight(); int min = width * height; for (BufferedImage bi : map.keySet()) { int count = 0; if (Math.abs(bi.getWidth()-width) > 2) continue; int widthmin = width < bi.getWidth() ? width : bi.getWidth(); int heightmin = height < bi.getHeight() ? height : bi.getHeight(); Label1: for (int x = 0; x < widthmin; ++x) { for (int y = 0; y < heightmin; ++y) { if (isBlack(img.getRGB(x, y)) != isBlack(bi.getRGB(x, y))) { count++; if (count >= min) break Label1; } } } if (count < min) { min = count; result = map.get(bi); } } return result; } public static String getAllOcr(String file) throws Exception { BufferedImage img = removeBackgroud(file);//去除重影 List<BufferedImage> listImg = splitImage(img);//切割圖片 Map<BufferedImage, String> map = loadTrainData(); String result = ""; for (BufferedImage bi : listImg) { result += getSingleCharOcr(bi, map); } //ImageIO.write(img, "JPG", new File("result6\\" + result + ".jpg")); return result; } /*** * 下載驗證碼圖片暫時保存供識別程序使用 * @param imgurl 驗證碼圖片url */ public static void downloadimg(String imgurl) { //HttpClient httpClient = new HttpClient(); //httpClient.getHttpConnectionManager().getParams().setConnectionTimeout(10000); //httpClient.getHttpConnectionManager().getParams().setSoTimeout(10000); GetMethod getMethod = new GetMethod(imgurl); try { int statusCode = client.executeMethod(getMethod); System.out.println(statusCode); if (statusCode != HttpStatus.SC_OK) { System.err.println("("+statusCode+")Method failed: "+ getMethod.getStatusLine()); logger.info("("+statusCode+")Method failed: "+ 
getMethod.getStatusLine()); } InputStream inputStream = getMethod.getResponseBodyAsStream(); OutputStream outStream = new FileOutputStream("/data/sata/share_sata/AmazonCrawl/amazonWeb/captcha.jpg"); IOUtils.copy(inputStream, outStream); inputStream.close(); outStream.close(); } catch (IOException e) { // TODO Auto-generated catch block //logger.info(new Date()+"captcha appear exception:"+e.getMessage()); try { //若遇到異常則睡眠20秒后繼續重試 Thread.sleep(20000); } catch (InterruptedException e1) { logger.error(e1); } e.printStackTrace(); }finally { getMethod.releaseConnection(); } } /*** * 抽取頁面驗證碼並返回 * @param stringBuffer * @return 驗證碼字符串 */ public static String GetCaptcha(StringBuilder html){ String captcha_str="######";//未識別則為# Document doc = Jsoup.parse(html.toString()); String imgurl = doc.select("div[class=a-row a-text-center]").get(0).child(0).attr("src"); //System.out.println(imgurl); downloadimg(imgurl); try { captcha_str = getAllOcr("/data/sata/share_sata/AmazonCrawl/amazonWeb/captcha.jpg"); } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); } return captcha_str; } }
後記:複雜驗證碼識別
對於複雜的驗證碼,目前最簡單的識別方案就是交給第三方人工打碼平台,可以參考我做的 EBay 多線程打碼兔驗證碼解決方案:
http://www.cnblogs.com/zeze/p/6402963.html
更專業的做法是採用機器學習、模式識別等方法去實現,但就我目前的測試結果來看,識別成功率不是很理想:複雜驗證碼的正確率只在百分之二三十上下。不過我的訓練樣本庫不是很大,增加訓練樣本後結果可能會好一點。