讀取文件信息所需依賴
<!-- 讀取Excel XLS -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>4.1.2</version>
</dependency>
<!-- 讀取PPT、DOC、Visio -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-scratchpad</artifactId>
<version>4.1.2</version>
</dependency>
<!-- 讀取Excel XLSX、PPTX、DOCX -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>4.1.2</version>
</dependency>
<!--讀取pdf信息-->
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>2.0.12</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.pdfbox/fontbox -->
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>fontbox</artifactId>
<version>2.0.12</version>
</dependency>
讀取doc文件內容
/**
 * Reads the text content of a legacy Word (.doc) file from disk.
 *
 * @param name path of the .doc file to read
 * @return the extracted text, or {@code null} if the file cannot be opened or parsed
 */
public static String readWord(String name)
{
    String text = null;
    // try-with-resources closes the stream and the extractor on every path,
    // including when the extractor constructor or getText() throws.
    try (FileInputStream in = new FileInputStream(name);
         WordExtractor extractor = new WordExtractor(in))
    {
        text = extractor.getText();
    }
    catch (Exception e)
    {
        // Best-effort read: a missing file or a parse failure is reported and
        // null is returned to the caller, as before.
        e.printStackTrace();
    }
    return text;
}
讀取doc文件內容(MultipartFile 上傳;下列代碼使用 WordExtractor,僅支持 .doc 格式,讀取 .docx 需改用 XWPFWordExtractor)
/**
 * Reads the text content of an uploaded Word document using {@code WordExtractor}.
 *
 * @param file the uploaded file
 * @return the extracted text, or an empty string if the upload is empty or an
 *         {@link IOException} occurs while reading it
 */
public static String readDoc(MultipartFile file) {
    if (file.isEmpty()) {
        return "";
    }
    // try-with-resources closes the stream and the extractor on every path.
    try (InputStream inputStream = file.getInputStream();
         WordExtractor wordExtractor = new WordExtractor(inputStream)) {
        return wordExtractor.getText();
    } catch (IOException e) {
        // Previously the extractor stayed null here and the final getText() call
        // threw a NullPointerException; return the same empty result as for an
        // empty upload instead.
        log.warn(e.toString(), e);
        return "";
    }
}
讀取xls文件內容
/**
 * Reads the cell contents of the first sheet of an uploaded .xls workbook.
 *
 * <p>Every cell value is followed by a tab character; no separator is written
 * between rows.
 *
 * @param file the uploaded .xls file
 * @return the tab-separated cell text; an empty string for an empty upload; whatever
 *         was collected before the failure if reading throws
 */
public static String readXls(MultipartFile file) {
    if (file.isEmpty()) {
        return "";
    }
    StringBuilder content = new StringBuilder();
    // try-with-resources releases the workbook and the upload stream on every path.
    try (InputStream in = file.getInputStream();
         HSSFWorkbook excel = new HSSFWorkbook(in)) {
        // Only the first sheet is read.
        HSSFSheet sheet0 = excel.getSheetAt(0);
        for (Iterator<?> rowIterator = sheet0.iterator(); rowIterator.hasNext(); ) {
            HSSFRow row = (HSSFRow) rowIterator.next();
            for (Iterator<?> cellIterator = row.cellIterator(); cellIterator.hasNext(); ) {
                HSSFCell cell = (HSSFCell) cellIterator.next();
                content.append(xlsCellText(cell)).append('\t');
            }
        }
    } catch (Exception e) {
        log.warn(e.toString(), e);
    }
    return content.toString();
}

/** Returns the text of one .xls cell: string or numeric value, otherwise an empty string. */
private static String xlsCellText(HSSFCell cell) {
    CellType type = cell.getCellType();
    if (type == CellType.FORMULA) {
        // Use the cached result type: calling getNumericCellValue() on a formula
        // whose result is text/boolean/error throws and would abort the whole read.
        type = cell.getCachedFormulaResultType();
    }
    if (type == CellType.STRING) {
        return cell.getStringCellValue();
    }
    if (type == CellType.NUMERIC) {
        return String.valueOf(cell.getNumericCellValue());
    }
    return "";
}
讀取xlsx文件內容
/**
 * Reads the cell contents of the first sheet of an uploaded .xlsx workbook.
 *
 * <p>Every cell value is followed by a tab character; no separator is written
 * between rows.
 *
 * @param file the uploaded .xlsx file
 * @return the tab-separated cell text; an empty string for an empty upload; whatever
 *         was collected before the failure if reading throws
 */
public static String readXlsx(MultipartFile file) {
    if (file.isEmpty()) {
        return "";
    }
    StringBuilder content = new StringBuilder();
    // try-with-resources releases the workbook and the upload stream on every path.
    try (InputStream in = file.getInputStream();
         XSSFWorkbook excel = new XSSFWorkbook(in)) {
        // Only the first sheet is read.
        XSSFSheet sheet0 = excel.getSheetAt(0);
        for (Iterator<?> rowIterator = sheet0.iterator(); rowIterator.hasNext(); ) {
            XSSFRow row = (XSSFRow) rowIterator.next();
            for (Iterator<?> cellIterator = row.cellIterator(); cellIterator.hasNext(); ) {
                XSSFCell cell = (XSSFCell) cellIterator.next();
                content.append(xlsxCellText(cell)).append('\t');
            }
        }
    } catch (Exception e) {
        log.warn(e.toString(), e);
    }
    return content.toString();
}

/** Returns the text of one .xlsx cell: string or numeric value, otherwise an empty string. */
private static String xlsxCellText(XSSFCell cell) {
    CellType type = cell.getCellType();
    if (type == CellType.FORMULA) {
        // Use the cached result type: calling getNumericCellValue() on a formula
        // whose result is text/boolean/error throws and would abort the whole read.
        type = cell.getCachedFormulaResultType();
    }
    if (type == CellType.STRING) {
        return cell.getStringCellValue();
    }
    if (type == CellType.NUMERIC) {
        return String.valueOf(cell.getNumericCellValue());
    }
    return "";
}
讀取pdf文件內容
/**
 * Extracts the text content of an uploaded PDF, sorted by position on the page.
 *
 * @param file the uploaded PDF file
 * @return the text of all pages, or an empty string if the upload is null/empty
 *         or the document cannot be read
 */
public static String readPdf(MultipartFile file) {
    // A null upload previously ended in the catch block and produced ""; keep that result.
    if (file == null || file.isEmpty()) {
        return "";
    }
    // try-with-resources closes the document (and its underlying buffers) as well as
    // the upload stream; the document was never closed before.
    try (InputStream is = file.getInputStream();
         PDDocument document = PDDocument.load(is)) {
        PDFTextStripper stripper = new PDFTextStripper();
        // Emit text in reading order rather than content-stream order.
        stripper.setSortByPosition(true);
        stripper.setStartPage(1);
        stripper.setEndPage(document.getNumberOfPages());
        return stripper.getText(document);
    } catch (Exception e) {
        log.warn(e.toString(), e);
        return "";
    }
}
PDF文件加載有兩種方式,無明顯差異,方式二代碼較簡潔:
// Approach 1 (方式一): wrap the stream in a RandomAccessBuffer and parse it with PDFParser.
// NOTE(review): neither the stream nor the resulting document is closed in this fragment —
// callers presumably need to close both; confirm at the call site.
InputStream input = null;
input = new FileInputStream( pdfFile );
// Load the PDF document
PDFParser parser = new PDFParser(new RandomAccessBuffer(input));
parser.parse();
document = parser.getPDDocument();
// Approach 2 (方式二): let PDDocument load the file directly.
document=PDDocument.load(pdfFile);
讀取ppt文件內容
/**
 * Extracts the text of every slide of an uploaded legacy PowerPoint (.ppt) file.
 *
 * @param file the uploaded .ppt file
 * @return the concatenated slide text; an empty string for an empty upload; whatever
 *         was collected before the failure if an {@link IOException} occurs
 */
public static String readPPT(MultipartFile file) {
    if (file.isEmpty()) {
        return "";
    }
    StringBuilder content = new StringBuilder();
    // try-with-resources closes the extractor, the slide show and the stream on every
    // path; previously close() was only reached when extraction succeeded.
    // The extractor keeps its original raw type so no additional POI type names are needed.
    try (InputStream is = file.getInputStream();
         HSLFSlideShow hslfSlideShow = new HSLFSlideShow(is);
         SlideShowExtractor slideShowExtractor = new SlideShowExtractor(hslfSlideShow)) {
        List<HSLFSlide> slides = hslfSlideShow.getSlides();
        for (HSLFSlide slide : slides) {
            content.append(slideShowExtractor.getText(slide));
        }
    } catch (IOException e) {
        log.warn(e.toString(), e);
    }
    return content.toString();
}
讀取pptx文件內容
/**
 * Extracts the text runs of every slide of an uploaded PowerPoint (.pptx) file.
 *
 * <p>Only shapes listed directly in each slide's shape tree are visited, and only
 * regular text runs of their paragraphs are collected; runs are concatenated
 * without any separator.
 *
 * @param file the uploaded .pptx file
 * @return the concatenated text; an empty string for an empty upload; whatever was
 *         collected before the failure if reading throws
 */
public static String readPPTX(MultipartFile file) {
    if (file.isEmpty()) {
        return "";
    }
    // Local, single-threaded buffer: StringBuilder is sufficient.
    StringBuilder content = new StringBuilder();
    // try-with-resources closes the slide show and the stream even when parsing fails;
    // previously close() was skipped whenever an exception was thrown.
    try (InputStream is = file.getInputStream();
         XMLSlideShow xmlSlideShow = new XMLSlideShow(is)) {
        List<XSLFSlide> slides = xmlSlideShow.getSlides();
        for (XSLFSlide slide : slides) {
            CTSlide rawSlide = slide.getXmlObject();
            CTGroupShape spTree = rawSlide.getCSld().getSpTree();
            List<CTShape> spList = spTree.getSpList();
            for (CTShape shape : spList) {
                CTTextBody txBody = shape.getTxBody();
                if (null == txBody) {
                    // Shape carries no text body.
                    continue;
                }
                List<CTTextParagraph> pList = txBody.getPList();
                for (CTTextParagraph textParagraph : pList) {
                    List<CTRegularTextRun> textRuns = textParagraph.getRList();
                    for (CTRegularTextRun textRun : textRuns) {
                        content.append(textRun.getT());
                    }
                }
            }
        }
    } catch (Exception e) {
        // Best-effort read: report the failure and return what was collected so far.
        e.printStackTrace();
    }
    return content.toString();
}