Java 使用 POI 读取 Word、使用 PDFBox 读取 PDF


🤔从各个博客 CV 出来的,不好意思

pom

	<dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi</artifactId>
            <version>4.1.2</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.apache.poi/poi-ooxml -->
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-ooxml</artifactId>
            <version>4.1.2</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.apache.poi/poi-ooxml-schemas -->
        <!-- 注意:poi-ooxml-schemas(精简版)与下方的 ooxml-schemas(完整版)包含重复的类,通常二者只需保留其一 -->
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-ooxml-schemas</artifactId>
            <version>4.1.2</version>
        </dependency>
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>ooxml-schemas</artifactId>
            <version>1.4</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.apache.poi/poi-scratchpad -->
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-scratchpad</artifactId>
            <version>4.1.2</version>
        </dependency>
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-compress</artifactId>
            <version>1.21</version>
        </dependency>
        <!--读取pdf信息-->
        <dependency>
            <groupId>org.apache.pdfbox</groupId>
            <artifactId>pdfbox</artifactId>
            <version>2.0.12</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.apache.pdfbox/fontbox -->
        <dependency>
            <groupId>org.apache.pdfbox</groupId>
            <artifactId>fontbox</artifactId>
            <version>2.0.12</version>
        </dependency>	

按段落 读取 docx

    /**
     * Reads an uploaded .docx (OOXML) file paragraph by paragraph.
     *
     * <p>Both the upload stream and the parsed document are closed before this
     * method returns, including when parsing fails. Checked exceptions raised
     * while opening or parsing the file are rethrown unchanged via
     * {@code @SneakyThrows}.
     *
     * @param file the uploaded .docx file
     */
    @SneakyThrows
    private void readDocx(MultipartFile file) {
        // XWPFDocument keeps the underlying OPC package open until close() is called,
        // so both resources are managed by try-with-resources.
        try (InputStream inputStream = file.getInputStream();
             XWPFDocument document = new XWPFDocument(inputStream)) {
            // Read the document's paragraphs.
            List<XWPFParagraph> paragraphs = document.getParagraphs();
            // NOTE(review): placeholder kept from the original skeleton; presumably
            // meant to be filled from each paragraph's text - confirm with the caller.
            List<WordFileInfo> infos = new ArrayList<>();
            for (XWPFParagraph paragraph : paragraphs) {
                String text = paragraph.getParagraphText();
            }
        }
    }

按段落 读取 doc

    /**
     * Reads an uploaded legacy .doc (binary Word) file paragraph by paragraph.
     *
     * <p>Both the upload stream and the parsed document are closed before this
     * method returns, including when parsing fails. Checked exceptions raised
     * while opening or parsing the file are rethrown unchanged via
     * {@code @SneakyThrows}.
     *
     * @param file the uploaded .doc file
     */
    @SneakyThrows
    private void readDoc(MultipartFile file) {
        // HWPFDocument is Closeable; release it and the upload stream deterministically.
        try (InputStream inputStream = file.getInputStream();
             HWPFDocument document = new HWPFDocument(inputStream)) {
            Range range = document.getRange();
            // NOTE(review): placeholder kept from the original skeleton; presumably
            // meant to be filled from each paragraph's text - confirm with the caller.
            List<WordFileInfo> infos = new ArrayList<>();
            // The paragraph count does not change while iterating, so query it once.
            int paragraphCount = range.numParagraphs();
            for (int i = 0; i < paragraphCount; i++) {
                Paragraph paragraph = range.getParagraph(i);
                String text = paragraph.text();
            }
        }
    }

读取 pdf

/**
     * Extracts the text content of a PDF document.
     *
     * <p>This is a best-effort read: any failure while loading the document or
     * extracting its text is printed to standard error and an empty string is
     * returned instead of propagating the exception. The parsed document is
     * always closed before returning. The supplied stream is read but not
     * closed by this method.
     *
     * @param inputStream stream supplying the PDF bytes
     * @return the extracted text of all pages, or an empty string on failure
     */
    private String readPDF(InputStream inputStream) {
        String text = "";
        // PDDocument must be closed to release the buffers created while parsing;
        // try-with-resources guarantees that even when text extraction throws.
        try (PDDocument document = PDDocument.load(inputStream)) {
            PDFTextStripper stripper = new PDFTextStripper();
            // Sort the extracted text by its position on the page.
            stripper.setSortByPosition(true);
            // Cover every page of the document.
            stripper.setStartPage(1);
            stripper.setEndPage(document.getNumberOfPages());
            text = stripper.getText(document);
        } catch (Exception e) {
            // Deliberately broad: callers rely on this method never throwing.
            e.printStackTrace();
        }
        return text;
    }


免责声明!

本站转载的文章为个人学习借鉴使用,本站对版权不负任何法律责任。如果侵犯了您的隐私权益,请联系本站邮箱yoyou2525@163.com删除。



 
粤ICP备18138465号  © 2018-2025 CODEPRJ.COM