通過 Spring Boot 讀取文件
目前只能簡單地讀出文本;進一步的數據處理可能還得再查資料(百度)。
依賴
<!-- 文件讀寫 --> <dependency> <groupId>org.apache.poi</groupId> <artifactId>poi</artifactId> <version>3.15</version> </dependency> <!-- .docx解析依賴 --> <dependency> <groupId>org.apache.poi</groupId> <artifactId>poi-ooxml</artifactId> <version>3.15</version> </dependency> <!-- .doc解析依賴 --> <dependency> <groupId>org.apache.poi</groupId> <artifactId>poi-scratchpad</artifactId> <version>3.15</version> </dependency> <!-- .pdf解析依賴 --> <dependency> <groupId>org.apache.pdfbox</groupId> <artifactId>pdfbox</artifactId> <version>2.0.4</version> </dependency> <!-- jxl 操作excel --> <dependency> <groupId>org.jxls</groupId> <artifactId>jxls-jexcel</artifactId> <version>1.0.6</version> </dependency>
傳入文件路徑,返回純文本(TXT)內容
package com.example.demo.read;

import org.apache.poi.POIXMLDocument;
import org.apache.poi.POIXMLTextExtractor;
import org.apache.poi.hslf.extractor.PowerPointExtractor;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.xslf.usermodel.XMLSlideShow;
import org.apache.poi.xslf.usermodel.XSLFSlide;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import javax.servlet.ServletException;
import javax.servlet.annotation.WebServlet;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.openxmlformats.schemas.drawingml.x2006.main.CTRegularTextRun;
import org.openxmlformats.schemas.drawingml.x2006.main.CTTextBody;
import org.openxmlformats.schemas.drawingml.x2006.main.CTTextParagraph;
import org.openxmlformats.schemas.presentationml.x2006.main.CTGroupShape;
import org.openxmlformats.schemas.presentationml.x2006.main.CTShape;
import org.openxmlformats.schemas.presentationml.x2006.main.CTSlide;
import org.apache.pdfbox.text.PDFTextStripper;
import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;

/**
 * Extracts plain text from common document formats using Apache POI and PDFBox.
 *
 * <p>Supported extensions: {@code .doc}, {@code .docx}, {@code .pdf}, {@code .txt},
 * {@code xls}, {@code xlsx}, {@code ppt} and {@code pptx}.
 *
 * <p>Created: 2019-07-27 9:48
 */
public class DocRead {

    /** Maximum number of characters returned by {@link #readWord(String)}. */
    private static final int MAX_LENGTH = 10000;

    /**
     * Guesses the character encoding of a text file.
     *
     * <p>A UTF-16LE, UTF-16BE or UTF-8 byte-order mark decides the result directly.
     * Without a BOM the bytes are scanned: the first well-formed three-byte UTF-8
     * sequence selects {@code "UTF-8"}; anything else leaves the default {@code "GBK"}.
     * This is a heuristic: two-byte lead/trail pairs are also valid GBK, so they are
     * skipped rather than treated as evidence.
     *
     * @param sourceFile the file to inspect
     * @return {@code "UTF-16LE"}, {@code "UTF-16BE"}, {@code "UTF-8"} or {@code "GBK"};
     *         {@code "GBK"} is also returned when the file is empty or cannot be read
     */
    private static String detectCharset(File sourceFile) {
        String charset = "GBK";
        byte[] head = new byte[3];
        try (BufferedInputStream bis = new BufferedInputStream(new FileInputStream(sourceFile))) {
            // The read limit must cover the bytes consumed before reset().
            bis.mark(head.length + 1);
            int read = bis.read(head, 0, head.length);
            if (read == -1) {
                return charset; // empty file: treat as ANSI/GBK
            }
            if (head[0] == (byte) 0xFF && head[1] == (byte) 0xFE) {
                return "UTF-16LE";
            }
            if (head[0] == (byte) 0xFE && head[1] == (byte) 0xFF) {
                return "UTF-16BE";
            }
            if (head[0] == (byte) 0xEF && head[1] == (byte) 0xBB && head[2] == (byte) 0xBF) {
                return "UTF-8";
            }

            // No BOM: rescan from the beginning looking for UTF-8 multi-byte sequences.
            bis.reset();
            while ((read = bis.read()) != -1) {
                if (read >= 0xF0) {
                    break;
                }
                if (isContinuationByte(read)) {
                    // A lone continuation byte cannot start a UTF-8 sequence: assume GBK.
                    break;
                }
                if (0xC0 <= read && read <= 0xDF) {
                    // Two-byte pair (0xC0-0xDF)(0x80-0xBF) is also valid GBK: keep scanning.
                    if (isContinuationByte(bis.read())) {
                        continue;
                    }
                    break;
                }
                if (0xE0 <= read && read <= 0xEF) {
                    // Three-byte sequence; a false positive is possible but unlikely.
                    if (isContinuationByte(bis.read()) && isContinuationByte(bis.read())) {
                        charset = "UTF-8";
                    }
                    break;
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return charset;
    }

    /** Returns whether {@code b} is in the UTF-8 continuation-byte range 0x80-0xBF. */
    private static boolean isContinuationByte(int b) {
        return 0x80 <= b && b <= 0xBF;
    }

    /**
     * Reads a document and returns its text content on a single line.
     *
     * <p>The format is chosen from the file extension (case-insensitive). Line breaks
     * in the extracted text are replaced by spaces and the result is truncated to
     * {@value #MAX_LENGTH} characters.
     *
     * @param filePath path of the file to read
     * @return the extracted text, or {@code null} when {@code filePath} is {@code null}
     *         or empty, the extension is not supported, or reading/parsing fails
     * @throws Exception declared for backward compatibility; failures while reading are
     *         reported on the console and signalled by a {@code null} return value
     */
    public static String readWord(String filePath) throws Exception {
        if (filePath == null || filePath.equals("")) {
            return null;
        }
        // Match extensions case-insensitively so that e.g. "REPORT.DOCX" is accepted.
        String lowerPath = filePath.toLowerCase(Locale.ROOT);
        try {
            String buffer;
            if (lowerPath.endsWith(".doc")) {
                buffer = readDoc(filePath);
            } else if (lowerPath.endsWith(".docx")) {
                buffer = readDocx(filePath);
            } else if (lowerPath.endsWith(".pdf")) {
                buffer = readPdf(filePath);
            } else if (lowerPath.endsWith(".txt")) {
                buffer = readTxt(filePath);
            } else if (lowerPath.endsWith("xls") || lowerPath.endsWith("xlsx")) {
                buffer = readExcel(filePath, lowerPath.endsWith("xls"));
            } else if (lowerPath.endsWith("ppt")) {
                buffer = readPpt(filePath);
            } else if (lowerPath.endsWith("pptx")) {
                buffer = readPptx(filePath);
            } else {
                return null;
            }
            // String.replace(String, String) is literal, so "\n|\r" never matched;
            // replace each line-break character explicitly.
            buffer = buffer.replace('\n', ' ').replace('\r', ' ');
            if (buffer.length() > MAX_LENGTH) {
                buffer = buffer.substring(0, MAX_LENGTH);
            }
            return buffer;
        } catch (Exception e) {
            System.out.print("error---->" + filePath);
            e.printStackTrace();
            return null;
        }
    }

    /** Extracts the text of a legacy binary Word ({@code .doc}) file. */
    private static String readDoc(String filePath) throws IOException {
        try (InputStream fis = new FileInputStream(new File(filePath));
             WordExtractor extractor = new WordExtractor(fis)) {
            return extractor.getText();
        }
    }

    /** Extracts the text of an OOXML Word ({@code .docx}) file. */
    private static String readDocx(String filePath) throws IOException {
        try (FileInputStream fis = new FileInputStream(filePath);
             XWPFDocument document = new XWPFDocument(fis);
             XWPFWordExtractor extractor = new XWPFWordExtractor(document)) {
            return extractor.getText();
        }
    }

    /** Extracts the text of a PDF file. */
    private static String readPdf(String filePath) throws IOException {
        try (InputStream fis = new FileInputStream(new File(filePath));
             PDDocument document = PDDocument.load(fis)) {
            return new PDFTextStripper().getText(document);
        }
    }

    /**
     * Reads a plain-text file line by line using the detected charset, joining lines
     * with a space and stopping once {@value #MAX_LENGTH} characters were collected.
     */
    private static String readTxt(String filePath) throws IOException {
        File file = new File(filePath);
        String charset = detectCharset(file);
        StringBuilder sb = new StringBuilder();
        try (InputStream is = new FileInputStream(file);
             InputStreamReader isr = new InputStreamReader(is, charset);
             BufferedReader reader = new BufferedReader(isr)) {
            boolean firstLine = true;
            String line;
            while ((line = reader.readLine()) != null && sb.length() < MAX_LENGTH) {
                if (firstLine) {
                    firstLine = false;
                    // The decoders used here do not consume the BOM; drop it from the text.
                    if (line.startsWith("\uFEFF")) {
                        line = line.substring(1);
                    }
                }
                sb.append(line).append(' ');
            }
        }
        return sb.toString();
    }

    /**
     * Concatenates the string form of every non-null cell of every sheet, separated by
     * spaces, stopping once {@value #MAX_LENGTH} characters were collected.
     *
     * @param legacy {@code true} for the binary {@code xls} format, {@code false} for {@code xlsx}
     */
    private static String readExcel(String filePath, boolean legacy) throws IOException {
        StringBuilder sb = new StringBuilder();
        try (FileInputStream fis = new FileInputStream(filePath);
             Workbook workbook = legacy ? new HSSFWorkbook(fis) : new XSSFWorkbook(fis)) {
            int sheetCount = workbook.getNumberOfSheets();
            for (int s = 0; s < sheetCount && sb.length() < MAX_LENGTH; s++) {
                Sheet sheet = workbook.getSheetAt(s);
                int lastRow = sheet.getLastRowNum();
                for (int r = sheet.getFirstRowNum(); r <= lastRow && sb.length() < MAX_LENGTH; r++) {
                    Row row = sheet.getRow(r);
                    if (row == null) {
                        continue; // rows that were never created are returned as null
                    }
                    int lastCell = row.getLastCellNum();
                    for (int c = row.getFirstCellNum(); c < lastCell && sb.length() < MAX_LENGTH; c++) {
                        Cell cell = row.getCell(c);
                        if (cell != null) {
                            sb.append(cell.toString()).append(' ');
                        }
                    }
                }
            }
        }
        return sb.toString();
    }

    /** Extracts the text of a legacy binary PowerPoint ({@code ppt}) file. */
    private static String readPpt(String filePath) throws IOException {
        try (FileInputStream fis = new FileInputStream(new File(filePath));
             PowerPointExtractor extractor = new PowerPointExtractor(fis)) {
            return extractor.getText();
        }
    }

    /**
     * Collects every regular text run found in the top-level shapes of each slide of an
     * OOXML PowerPoint ({@code pptx}) file, separated by spaces.
     */
    private static String readPptx(String filePath) throws IOException {
        StringBuilder sb = new StringBuilder();
        try (FileInputStream fis = new FileInputStream(filePath);
             XMLSlideShow slideShow = new XMLSlideShow(fis)) {
            List<XSLFSlide> slides = slideShow.getSlides();
            for (XSLFSlide slide : slides) {
                CTSlide rawSlide = slide.getXmlObject();
                CTGroupShape shapeTree = rawSlide.getCSld().getSpTree();
                for (CTShape shape : shapeTree.getSpArray()) {
                    CTTextBody textBody = shape.getTxBody();
                    if (textBody == null) {
                        continue; // shape without text
                    }
                    for (CTTextParagraph paragraph : textBody.getPArray()) {
                        for (CTRegularTextRun run : paragraph.getRArray()) {
                            sb.append(run.getT()).append(' ');
                        }
                    }
                }
            }
        }
        return sb.toString();
    }
}