Java讀取doc、docx、xls、xlsx、ppt、pptx、pdf文件內容


讀取文件信息所需依賴

<!-- 讀取Excel XLS -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi</artifactId>
    <version>4.1.2</version>
</dependency>
<!-- 讀取PPT、DOC、Visio -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-scratchpad</artifactId>
    <version>4.1.2</version>
</dependency>
<!-- 讀取Excel XLSX、PPTX、DOCX -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-ooxml</artifactId>
    <version>4.1.2</version>
</dependency>
<!--讀取pdf信息-->
<dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>pdfbox</artifactId>
    <version>2.0.12</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.pdfbox/fontbox -->
<dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>fontbox</artifactId>
    <version>2.0.12</version>
</dependency>

讀取doc文件內容

/**
 * Extracts the text of a Word document stored on disk.
 *
 * <p>The file is opened with POI's {@code WordExtractor}. Both the file
 * stream and the extractor are closed before this method returns, whether
 * or not extraction succeeds.
 *
 * @param name path of the file to read
 * @return the extracted text, or {@code null} if the file cannot be opened
 *         or parsed (the failure is printed to standard error)
 */
public static String readWord(String name)
{
    String text = null;
    // try-with-resources closes the extractor first, then the file stream.
    try (FileInputStream in = new FileInputStream(name);
         WordExtractor extractor = new WordExtractor(in))
    {
        text = extractor.getText();
    }
    catch (Exception e)
    {
        // Best-effort reader: a missing or malformed file yields null
        // rather than an exception, as callers of this method expect.
        e.printStackTrace();
    }
    return text;
}

讀取docx文件內容(注意:下方代碼使用的 WordExtractor 僅支持 .doc 格式;讀取 .docx 需改用 XWPFDocument / XWPFWordExtractor)

/**
 * Extracts the text of an uploaded Word document.
 *
 * <p>The upload is parsed with POI's {@code WordExtractor}.
 * NOTE(review): {@code WordExtractor} belongs to POI's HWPF (.doc) support;
 * a .docx upload is expected to be rejected by its constructor with an
 * unchecked exception, which this method does not catch. Confirm the
 * intended input format with callers.
 *
 * @param file the uploaded file
 * @return the extracted text; an empty string if the upload is empty or an
 *         {@link IOException} occurs while reading it
 */
public static String readDoc(MultipartFile file) {
    if (file.isEmpty()) {
        return "";
    }
    // Both resources are closed automatically, including on failure.
    try (InputStream inputStream = file.getInputStream();
         WordExtractor wordExtractor = new WordExtractor(inputStream)) {
        return wordExtractor.getText();
    } catch (IOException e) {
        log.warn(e.toString());
        e.printStackTrace();
        // Previously this path fell through to a NullPointerException on the
        // unset extractor; return an empty result like the sibling readers do.
        return "";
    }
}

讀取xls文件內容

/**
 * Extracts the cell values of the first sheet of an uploaded .xls workbook.
 *
 * <p>Every physical cell contributes its value followed by a tab character.
 * String cells contribute their text, numeric cells their {@code double}
 * value, and formula cells the cached result of the formula (text or
 * number). Any other cell type contributes an empty value. Rows are not
 * separated: all cells are concatenated in iteration order.
 *
 * @param file the uploaded file
 * @return the tab-separated cell values; an empty string if the upload is
 *         empty; whatever was collected so far if reading fails part-way
 */
public static String readXls(MultipartFile file) {
    if (file.isEmpty()) {
        return "";
    }
    StringBuilder content = new StringBuilder();
    // Interface types are written fully qualified so this snippet does not
    // depend on imports beyond those the original code already needed.
    try (java.io.InputStream in = file.getInputStream();
         HSSFWorkbook excel = new HSSFWorkbook(in)) {
        // Only the first sheet is read.
        HSSFSheet sheet0 = excel.getSheetAt(0);
        for (org.apache.poi.ss.usermodel.Row row : sheet0) {
            for (org.apache.poi.ss.usermodel.Cell cell : row) {
                CellType type = cell.getCellType();
                if (type == CellType.FORMULA) {
                    // A formula may evaluate to text; reading it as a number
                    // would throw, so dispatch on the cached result type.
                    type = cell.getCachedFormulaResultType();
                }
                if (type == CellType.STRING) {
                    content.append(cell.getStringCellValue());
                } else if (type == CellType.NUMERIC) {
                    content.append(cell.getNumericCellValue());
                }
                content.append('\t');
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
        log.warn(e.toString());
    }
    return content.toString();
}

讀取xlsx文件內容

/**
 * Extracts the cell values of the first sheet of an uploaded .xlsx workbook.
 *
 * <p>Every physical cell contributes its value followed by a tab character.
 * String cells contribute their text, numeric cells their {@code double}
 * value, and formula cells the cached result of the formula (text or
 * number). Any other cell type contributes an empty value. Rows are not
 * separated: all cells are concatenated in iteration order.
 *
 * @param file the uploaded file
 * @return the tab-separated cell values; an empty string if the upload is
 *         empty; whatever was collected so far if reading fails part-way
 */
public static String readXlsx(MultipartFile file) {
    if (file.isEmpty()) {
        return "";
    }
    StringBuilder content = new StringBuilder();
    // Interface types are written fully qualified so this snippet does not
    // depend on imports beyond those the original code already needed.
    try (java.io.InputStream in = file.getInputStream();
         XSSFWorkbook excel = new XSSFWorkbook(in)) {
        // Only the first sheet is read.
        XSSFSheet sheet0 = excel.getSheetAt(0);
        for (org.apache.poi.ss.usermodel.Row row : sheet0) {
            for (org.apache.poi.ss.usermodel.Cell cell : row) {
                CellType type = cell.getCellType();
                if (type == CellType.FORMULA) {
                    // A formula may evaluate to text; reading it as a number
                    // would throw, so dispatch on the cached result type.
                    type = cell.getCachedFormulaResultType();
                }
                if (type == CellType.STRING) {
                    content.append(cell.getStringCellValue());
                } else if (type == CellType.NUMERIC) {
                    content.append(cell.getNumericCellValue());
                }
                content.append('\t');
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
        log.warn(e.toString());
    }
    return content.toString();
}

讀取pdf文件內容

/**
 * Extracts the text content of an uploaded PDF.
 *
 * <p>The document is loaded with {@code PDDocument.load} and closed before
 * this method returns, together with the upload's input stream. Text is
 * extracted from the first page through the last, sorted by position.
 *
 * @param file the uploaded file
 * @return the extracted text; an empty string if the upload is empty or the
 *         document cannot be loaded or read
 */
public static String readPdf(MultipartFile file) {
    // Same guard as the sibling readers: nothing to parse in an empty upload.
    if (file.isEmpty()) {
        return "";
    }
    try (InputStream is = file.getInputStream();
         PDDocument document = PDDocument.load(is)) {
        PDFTextStripper stripper = new PDFTextStripper();
        // Emit text in positional (reading) order rather than stream order.
        stripper.setSortByPosition(true);
        stripper.setStartPage(1);
        stripper.setEndPage(document.getNumberOfPages());
        return stripper.getText(document);
    } catch (Exception e) {
        e.printStackTrace();
        log.warn(e.toString());
        return "";
    }
}

PDF文件加載有兩種方式,無明顯差異,方式二代碼較簡潔:

// Option 1: wrap the file stream in a RandomAccessBuffer, parse it with
// PDFParser, then take the resulting PDDocument from the parser.
// NOTE(review): this fragment never closes `input` or `document`; the
// surrounding code (not shown here) is presumably responsible for that.
InputStream input = null;
input = new FileInputStream( pdfFile );
// Load the PDF document
PDFParser parser = new PDFParser(new RandomAccessBuffer(input));
parser.parse();
document = parser.getPDDocument();

 // Option 2: let PDDocument load the file in a single call.
document=PDDocument.load(pdfFile);   

讀取ppt文件內容

/**
 * Extracts the text of every slide of an uploaded .ppt presentation.
 *
 * <p>The text of each slide, as produced by POI's
 * {@code SlideShowExtractor}, is concatenated in slide order. The input
 * stream, the slide show and the extractor are all closed before this
 * method returns, including when extraction fails part-way.
 *
 * @param file the uploaded file
 * @return the concatenated slide text; an empty string if the upload is
 *         empty; whatever was collected so far if an {@link IOException}
 *         occurs
 */
public static String readPPT(MultipartFile file) {
    if (file.isEmpty()) {
        return "";
    }
    StringBuilder content = new StringBuilder();
    // Type arguments are written fully qualified so this snippet does not
    // depend on imports beyond those the original code already needed.
    try (InputStream is = file.getInputStream();
         HSLFSlideShow hslfSlideShow = new HSLFSlideShow(is);
         SlideShowExtractor<org.apache.poi.hslf.usermodel.HSLFShape,
                 org.apache.poi.hslf.usermodel.HSLFTextParagraph> slideShowExtractor =
                 new SlideShowExtractor<>(hslfSlideShow)) {
        for (HSLFSlide slide : hslfSlideShow.getSlides()) {
            content.append(slideShowExtractor.getText(slide));
        }
    } catch (IOException e) {
        log.warn(e.toString());
        e.printStackTrace();
    }
    return content.toString();
}

讀取pptx文件內容

/**
 * Extracts the text of an uploaded .pptx presentation.
 *
 * <p>For every slide, the shapes listed directly in the slide's shape tree
 * are visited. The text of each regular text run in each paragraph of a
 * shape's text body is appended to the result with no separator. Shapes
 * without a text body are skipped. The input stream and the slide show are
 * closed before this method returns, including when extraction fails.
 *
 * @param file the uploaded file
 * @return the concatenated run text; an empty string if the upload is
 *         empty; whatever was collected so far if reading fails part-way
 */
public static String readPPTX(MultipartFile file) {
    if (file.isEmpty()) {
        return "";
    }
    // Method-local buffer: no synchronization is needed, so StringBuilder.
    StringBuilder content = new StringBuilder();
    try (InputStream is = file.getInputStream();
         XMLSlideShow xmlSlideShow = new XMLSlideShow(is)) {
        for (XSLFSlide slide : xmlSlideShow.getSlides()) {
            CTGroupShape spTree = slide.getXmlObject().getCSld().getSpTree();
            for (CTShape shape : spTree.getSpList()) {
                CTTextBody txBody = shape.getTxBody();
                if (txBody == null) {
                    continue;
                }
                for (CTTextParagraph textParagraph : txBody.getPList()) {
                    for (CTRegularTextRun textRun : textParagraph.getRList()) {
                        content.append(textRun.getT());
                    }
                }
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    return content.toString();
}


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯繫本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM