/**
* Extracts the text and image content of PDF documents using PDFBox.
* PDFBox website: https://pdfbox.apache.org/
* Maven dependencies:
* <dependency>
* <groupId>org.apache.pdfbox</groupId>
* <artifactId>fontbox</artifactId>
* <version>2.0.1</version>
* </dependency>
* <dependency>
* <groupId>org.apache.pdfbox</groupId>
* <artifactId>pdfbox</artifactId>
* <version>2.0.1</version>
* </dependency>
* <dependency>
* <groupId>com.itextpdf</groupId>
* <artifactId>itextpdf</artifactId>
* <version>5.5.13</version>
* </dependency>
* <dependency>
* <groupId>net.coobird</groupId>
* <artifactId>thumbnailator</artifactId>
* <version>0.4.8</version>
* </dependency>
*/
public class PdfTest {
    /** Pattern used by {@link #dateFormat(Calendar)}. */
    public static final String DATE_FORMAT = "yyyy-MM-dd HH:mm:ss";

    /** Folder used by {@link #pdfParse(String)} for extracted images (Windows-style path). */
    private static final String DEFAULT_IMAGE_DIR = "D:\\pdf\\";

    /** Scale factor applied to every embedded image before it is saved. */
    private static final double IMAGE_SCALE = 0.9;

    /** JPEG compression quality used when rendering pages to images. */
    private static final float JPEG_QUALITY = 0.3f;

    /**
     * Prints the text of a PDF document to standard output, one page at a time.
     * Each page is preceded by a {@code "page N:"} heading underlined with dashes.
     *
     * @param pdfFilePath path of the PDF file to read
     * @throws IOException if the document does not grant permission to extract content
     * @throws Exception if the document cannot be loaded or read
     */
    public static void extractText(String pdfFilePath) throws Exception {
        try (PDDocument document = PDDocument.load(new File(pdfFilePath))) {
            AccessPermission permission = document.getCurrentAccessPermission();
            if (!permission.canExtractContent()) {
                throw new IOException("You do not have permission to extract text");
            }
            PDFTextStripper stripper = new PDFTextStripper();
            stripper.setSortByPosition(true);
            int pageCount = document.getNumberOfPages();
            for (int p = 1; p <= pageCount; ++p) {
                // Restrict the stripper to a single page; otherwise getText() would
                // return the text of the whole document at once.
                stripper.setStartPage(p);
                stripper.setEndPage(p);
                String text = stripper.getText(document);
                String heading = String.format("page %d:", p);
                System.out.println(heading);
                // Underline the heading so the output is easier to read.
                System.out.println(dashes(heading.length()));
                System.out.println(text.trim());
                System.out.println();
            }
        }
    }

    /**
     * Prints the metadata and full text of a PDF document and saves every image
     * XObject found in the page resources to the default image folder.
     *
     * @param pdfPath path of the PDF file to read
     * @throws Exception if the document cannot be loaded or read, or an image cannot be saved
     */
    public static void pdfParse(String pdfPath) throws Exception {
        pdfParse(pdfPath, DEFAULT_IMAGE_DIR);
    }

    /**
     * Prints the metadata and full text of a PDF document and saves every image
     * XObject found in the page resources to {@code imageOutputDir}.
     *
     * @param pdfPath path of the PDF file to read
     * @param imageOutputDir folder that receives the extracted images; it is created
     *        when the first image is saved; {@code null} selects the default folder
     * @throws Exception if the document cannot be loaded or read, or an image cannot be saved
     */
    public static void pdfParse(String pdfPath, String imageOutputDir) throws Exception {
        File outputDir = new File(imageOutputDir == null ? DEFAULT_IMAGE_DIR : imageOutputDir);
        try (PDDocument document = PDDocument.load(new File(pdfPath))) {
            printDocumentInformation(document.getDocumentInformation());

            PDFTextStripper stripper = new PDFTextStripper();
            System.out.println("內容:" + stripper.getText(document));

            int savedImages = 0;
            int pageCount = document.getNumberOfPages();
            for (int i = 0; i < pageCount; i++) {
                PDPage page = document.getPage(i);
                if (page == null) {
                    continue;
                }
                PDResources resources = page.getResources();
                if (resources == null) {
                    // No resource dictionary to inspect for this page.
                    continue;
                }
                for (COSName name : resources.getXObjectNames()) {
                    System.out.println(name.getName());
                    if (!resources.isImageXObject(name)) {
                        continue;
                    }
                    if (savedImages == 0 && !createDirectory(outputDir.getPath())) {
                        throw new IOException("Cannot create image folder " + outputDir);
                    }
                    savedImages++;
                    PDImageXObject image = (PDImageXObject) resources.getXObject(name);
                    // The running counter keeps names unique when several images are
                    // saved within the same millisecond.
                    File target = new File(outputDir,
                            System.currentTimeMillis() + "_" + savedImages + ".jpg");
                    Thumbnails.of(image.getImage()).scale(IMAGE_SCALE).toFile(target);
                }
            }
        }
    }

    /** Prints the fields of the document information dictionary to standard output. */
    private static void printDocumentInformation(PDDocumentInformation info) {
        System.out.println("標題:" + info.getTitle());
        System.out.println("主題:" + info.getSubject());
        System.out.println("作者:" + info.getAuthor());
        System.out.println("關鍵字:" + info.getKeywords());
        System.out.println("應用程序:" + info.getCreator());
        System.out.println("pdf 制作程序:" + info.getProducer());
        // Previously printed under the author label, which was a copy-paste mistake.
        System.out.println("陷印:" + info.getTrapped());
        System.out.println("創建時間:" + dateFormat(info.getCreationDate()));
        System.out.println("修改時間:" + dateFormat(info.getModificationDate()));
    }

    /**
     * Renders every page of a PDF file to a JPEG image.
     * Images are written to {@code <folder>/<pdf name>/<pdf name>_NNN.jpg}, where
     * {@code <folder>} is {@code dstImgFolder}, or the folder containing the PDF
     * when {@code dstImgFolder} is {@code null} or empty.
     *
     * @param PdfFilePath full path of the PDF file
     * @param dstImgFolder destination folder; {@code null} or empty to use the PDF's own folder
     * @param dpi rendering resolution; larger values give sharper images but render more slowly
     * @return {@code true} if all pages were written; {@code false} if the output folder
     *         could not be created or an I/O error occurred
     */
    private static boolean pdf2Image(String PdfFilePath, String dstImgFolder, int dpi) {
        File file = new File(PdfFilePath);
        String fileName = file.getName();
        int dot = fileName.lastIndexOf('.');
        // Strip the extension if there is one; a name without a dot is used as is.
        String imagePDFName = dot > 0 ? fileName.substring(0, dot) : fileName;

        String baseFolder;
        if (dstImgFolder == null || dstImgFolder.isEmpty()) {
            // getAbsoluteFile() so that a bare file name still has a parent folder.
            baseFolder = file.getAbsoluteFile().getParent();
        } else {
            baseFolder = dstImgFolder;
        }
        File imgFolder = new File(baseFolder, imagePDFName);

        if (!createDirectory(imgFolder.getPath())) {
            System.out.println("PDF文檔轉圖片失敗:" + "創建" + imgFolder.getPath() + "失敗");
            return false;
        }

        try (PDDocument pdDocument = PDDocument.load(file)) {
            PDFRenderer renderer = new PDFRenderer(pdDocument);
            int pages = pdDocument.getNumberOfPages();
            for (int i = 0; i < pages; i++) {
                File dstFile = new File(imgFolder,
                        imagePDFName + "_" + formatNumber(i + 1) + ".jpg");
                BufferedImage image = renderer.renderImageWithDPI(i, dpi);
                writeJpeg(image, dstFile);
            }
            System.out.println("PDF文檔轉圖片成功!" + imgFolder.getPath());
            return true;
        } catch (IOException e) {
            e.printStackTrace();
            return false;
        }
    }

    /** Writes {@code image} to {@code dstFile} as a JPEG and releases the writer and stream. */
    private static void writeJpeg(BufferedImage image, File dstFile) throws IOException {
        Iterator<ImageWriter> writers = ImageIO.getImageWritersByFormatName("jpg");
        if (!writers.hasNext()) {
            throw new IOException("No ImageIO writer available for format jpg");
        }
        // The image output stream does not truncate an existing file, so remove it
        // first to avoid stale trailing bytes from an earlier, larger image.
        if (dstFile.exists() && !dstFile.delete()) {
            throw new IOException("Cannot replace existing file " + dstFile);
        }
        ImageWriter writer = writers.next();
        try (javax.imageio.stream.ImageOutputStream output =
                ImageIO.createImageOutputStream(dstFile)) {
            if (output == null) {
                throw new IOException("Cannot open " + dstFile + " for writing");
            }
            writer.setOutput(output);
            ImageWriteParam param = writer.getDefaultWriteParam();
            param.setCompressionMode(ImageWriteParam.MODE_EXPLICIT);
            param.setCompressionQuality(JPEG_QUALITY);
            writer.write(null, new IIOImage(image, null, null), param);
        } finally {
            writer.dispose();
        }
    }

    /** Returns {@code i} left-padded with zeros to at least three digits. */
    private static String formatNumber(int i) {
        return String.format("%03d", i);
    }

    /**
     * Ensures that {@code folder} exists as a directory, creating it and any
     * missing parents if necessary.
     *
     * @return {@code true} if the directory exists when the method returns
     */
    private static boolean createDirectory(String folder) {
        File dir = new File(folder);
        // Re-check after mkdirs() in case another thread or process created it meanwhile.
        return dir.isDirectory() || dir.mkdirs() || dir.isDirectory();
    }

    /** Returns a string of {@code length} dash characters. */
    private static String dashes(int length) {
        StringBuilder line = new StringBuilder(length);
        for (int i = 0; i < length; ++i) {
            line.append('-');
        }
        return line.toString();
    }

    /**
     * Formats a calendar value using {@link #DATE_FORMAT}.
     *
     * @param calendar the value to format; may be {@code null}
     * @return the formatted date, or {@code null} if {@code calendar} is {@code null}
     */
    public static String dateFormat(Calendar calendar) {
        if (calendar == null) {
            return null;
        }
        // A new formatter per call: SimpleDateFormat instances are not thread-safe.
        return new SimpleDateFormat(DATE_FORMAT).format(calendar.getTime());
    }
}
