獲取 pdf 關鍵字坐標


package Demo.qd;

import com.itextpdf.awt.geom.Rectangle2D.Float;
import com.itextpdf.text.pdf.PdfDictionary;
import com.itextpdf.text.pdf.PdfName;
import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.parser.*;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.math.BigDecimal;
import java.util.ArrayList;
import java.util.List;

public class PdfPositionTool {

    public static void main(String[] args) throws Exception {
        PdfPositionTool pdfPositionTool = new PdfPositionTool();
        List<double[]> positions = pdfPositionTool.getPositions("/Users/yourouniu/Desktop/111.pdf", "%蓋章處%");
        if (positions != null && positions.size() > 0) {
            for (double[] position : positions) {
                System.out.println("pageNum: " + (int) position[0]);
                System.out.println("x: " + position[1]);
                System.out.println("y: " + position[2]);
            }
        }
    }

    /**
     * @return List<float [ ]> 坐標數組:float[0]:頁碼,float[1]:x ,float[2]:y
     * @Description 獲取關鍵字坐標
     * @Param filePath:pdf 路徑
     * @Param keyword:關鍵字
     */
    public List<double[]> getPositions(String filePath, String keyword) throws IOException {
        PdfPositionTool pdfPositionTool = new PdfPositionTool();
        //1.給定文件
        File pdfFile = new File(filePath);
        //2.定義一個byte數組,長度為文件的長度
        byte[] pdfData = new byte[(int) pdfFile.length()];
        //3.IO流讀取文件內容到byte數組
        FileInputStream inputStream = null;
        try {
            inputStream = new FileInputStream(pdfFile);
            inputStream.read(pdfData);
        } catch (IOException e) {
            throw e;
        } finally {
            if (inputStream != null) {
                try {
                    inputStream.close();
                } catch (IOException e) {
                }
            }
        }
        //5.調用方法,給定關鍵字和文件
        List<double[]> positions = pdfPositionTool.findKeywordPostions(pdfData, keyword);
        return positions;
    }

    /**
     * @Description pdf 坐標轉換為 ofd 坐標,比值為 25.4/72 ,該轉換存在誤差
     * 最好的轉換方式為按距離原點的百分比計算
     */
    private double transForPosition(double pdfPosition) {
        double ofdPosition = pdfPosition * 25.4 / 72;
        return ofdPosition;
    }


    /**
     * @param pdfData 通過IO流 PDF文件轉化的byte數組
     * @param keyword 關鍵字
     * @return List<float [ ]> : float[0]:pageNum float[1]:x float[2]:y
     * @throws IOException
     */
    public List<double[]> findKeywordPostions(byte[] pdfData, String keyword) throws IOException {
        List<double[]> result = new ArrayList<>();
        List<PdfPageContentPositions> pdfPageContentPositions = getPdfContentPostionsList(pdfData);
        for (PdfPageContentPositions pdfPageContentPosition : pdfPageContentPositions) {
            List<double[]> charPositions = findPositions(keyword, pdfPageContentPosition);
            if (charPositions == null || charPositions.size() < 1) {
                continue;
            }
            result.addAll(charPositions);
        }
        return result;
    }


    private List<PdfPageContentPositions> getPdfContentPostionsList(byte[] pdfData) throws IOException {
        PdfReader reader = new PdfReader(pdfData);
        List<PdfPageContentPositions> result = new ArrayList<>();
        int pages = reader.getNumberOfPages();
        for (int pageNum = 1; pageNum <= pages; pageNum++) {
            float width = reader.getPageSize(pageNum).getWidth();
            float height = reader.getPageSize(pageNum).getHeight();
            PdfRenderListener pdfRenderListener = new PdfRenderListener(pageNum, width, height);
            //解析pdf,定位位置
            PdfContentStreamProcessor processor = new PdfContentStreamProcessor(pdfRenderListener);
            PdfDictionary pageDic = reader.getPageN(pageNum);
            PdfDictionary resourcesDic = pageDic.getAsDict(PdfName.RESOURCES);
            try {
                processor.processContent(ContentByteUtils.getContentBytesForPage(reader, pageNum), resourcesDic);
            } catch (IOException e) {
                reader.close();
                throw e;
            }
            String content = pdfRenderListener.getContent();
            List<CharPosition> charPositions = pdfRenderListener.getcharPositions();
            List<double[]> positionsList = new ArrayList<>();
            for (CharPosition charPosition : charPositions) {
                double[] positions = new double[]{charPosition.getPageNum(), charPosition.getX(), charPosition.getY()};
                positionsList.add(positions);
            }
            PdfPageContentPositions pdfPageContentPositions = new PdfPageContentPositions();
            pdfPageContentPositions.setContent(content);
            pdfPageContentPositions.setPostions(positionsList);
            result.add(pdfPageContentPositions);
        }
        reader.close();
        return result;
    }

    private static List<double[]> findPositions(String keyword, PdfPageContentPositions pdfPageContentPositions) {
        List<double[]> result = new ArrayList<>();
        String content = pdfPageContentPositions.getContent();
        List<double[]> charPositions = pdfPageContentPositions.getPositions();
        for (int pos = 0; pos < content.length(); ) {
            int positionIndex = content.indexOf(keyword, pos);
            if (positionIndex == -1) {
                break;
            }
            double[] postions = charPositions.get(positionIndex);
            result.add(postions);
            pos = positionIndex + 1;
        }
        return result;
    }

    private class PdfPageContentPositions {
        private String content;
        private List<double[]> positions;

        public String getContent() {
            return content;
        }

        public void setContent(String content) {
            this.content = content;
        }

        public List<double[]> getPositions() {
            return positions;
        }

        public void setPostions(List<double[]> positions) {
            this.positions = positions;
        }
    }

    private class PdfRenderListener implements RenderListener {
        private int pageNum;
        private float pageWidth;
        private float pageHeight;
        private StringBuilder contentBuilder = new StringBuilder();
        private List<CharPosition> charPositions = new ArrayList<>();

        public PdfRenderListener(int pageNum, float pageWidth, float pageHeight) {
            this.pageNum = pageNum;
            this.pageWidth = pageWidth;
            this.pageHeight = pageHeight;
        }

        public void beginTextBlock() {
        }

        /**
         * @Description 計算轉換后的 ofd 坐標值
         * 如有需要,可轉為計算距離原點的百分比值。在知道 ofd 長寬的情況下,用百分比重新計算坐標更精確
         */
        public void renderText(TextRenderInfo renderInfo) {
            List<TextRenderInfo> characterRenderInfos = renderInfo.getCharacterRenderInfos();
            for (TextRenderInfo textRenderInfo : characterRenderInfos) {
                String word = textRenderInfo.getText();
                if (word.length() > 1) {
                    word = word.substring(word.length() - 1, word.length());
                }
                Float rectangle = textRenderInfo.getAscentLine().getBoundingRectange();
                float x = (float) rectangle.getX();
                float y = (float) rectangle.getY();
                //這兩個是關鍵字在所在頁面的XY軸的百分比
                float xPercent = Math.round(x / pageWidth * 10000) / 10000f;
                // pdf 原點在左下,ofd 原點在左上
                float yPercent = Math.round((1 - y / pageHeight) * 10000) / 10000f;
                CharPosition charPosition = new CharPosition(pageNum, transForPosition(x),
                        transForPosition((yPercent) * pageHeight));
                charPositions.add(charPosition);
                contentBuilder.append(word);
            }
        }

        public void endTextBlock() {
        }

        public void renderImage(ImageRenderInfo renderInfo) {
        }

        public String getContent() {
            return contentBuilder.toString();
        }

        public List<CharPosition> getcharPositions() {
            return charPositions;
        }
    }

    private class CharPosition {
        private int pageNum = 0;
        private double x = 0;
        private double y = 0;

        public CharPosition(int pageNum, double x, double y) {
            this.pageNum = pageNum;
            this.x = x;
            this.y = y;
        }

        public int getPageNum() {
            return pageNum;
        }

        public double getX() {
            return x;
        }

        public double getY() {
            return y;
        }

        @Override
        public String toString() {
            return "[pageNum=" + this.pageNum + ",x=" + this.x + ",y=" + this.y + "]";
        }
    }
}

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯繫本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM