pdf轉圖片、提取pdf文本、提取pdf圖片

本文轉載自查看原文 2019-01-08 16:06 1236

/**
   * 使用pdfbox提取pdf文檔的文字和圖片內容
   * pdfbox官網：https://pdfbox.apache.org/
   * maven依賴如下：
   * <dependency>
   * <groupId>org.apache.pdfbox</groupId>
   * <artifactId>fontbox</artifactId>
   * <version>2.0.1</version>
   * </dependency>
   * <dependency>
   * <groupId>org.apache.pdfbox</groupId>
   * <artifactId>pdfbox</artifactId>
   * <version>2.0.1</version>
   * </dependency>
   * <dependency>
   * <groupId>com.itextpdf</groupId>
   * <artifactId>itextpdf</artifactId>
   * <version>5.5.13</version>
   * </dependency>
   * <dependency>
   * <groupId>net.coobird</groupId>
   * <artifactId>thumbnailator</artifactId>
   * <version>0.4.8</version>
   * </dependency>
   */
   public class PdfTest {
   public static final String DATE_FORMAT = "yyyy-MM-dd HH:mm:ss";

   /**
    * Prints the text of a PDF file to standard output, one page at a time.
    *
    * <p>For every page a {@code page N:} heading is printed, underlined with one dash per
    * heading character, followed by the trimmed page text and an empty line.
    *
    * @param pdfFilePath path of the PDF file to read
    * @throws Exception if the document cannot be loaded or its text cannot be extracted; an
    *         {@link IOException} is thrown when the document's current access permission
    *         does not allow content extraction
    */
   public static void extractText(String pdfFilePath) throws Exception {
       File pdfFile = new File(pdfFilePath);
       try (PDDocument doc = PDDocument.load(pdfFile)) {
           boolean extractionAllowed = doc.getCurrentAccessPermission().canExtractContent();
           if (!extractionAllowed) {
               throw new IOException("You do not have permission to extract text");
           }

           PDFTextStripper textStripper = new PDFTextStripper();
           textStripper.setSortByPosition(true);

           int pageNo = 1;
           while (pageNo <= doc.getNumberOfPages()) {
               // Narrow the stripper to a single page. Without a start/end page it would
               // return the text of the whole document in one call.
               textStripper.setStartPage(pageNo);
               textStripper.setEndPage(pageNo);

               // Original author's note: the extraction is thorough and also returns
               // header and footer text.
               String pageText = textStripper.getText(doc);

               String heading = String.format("page %d:", pageNo);
               // Underline as long as the heading, purely for nicer console output.
               StringBuilder underline = new StringBuilder();
               for (int k = 0; k < heading.length(); k++) {
                   underline.append("-");
               }

               System.out.println(heading);
               System.out.println(underline.toString());
               System.out.println(pageText.trim());
               System.out.println();

               pageNo++;
           }
       }
   }

   /**
    * Prints a PDF's metadata and full text to standard output and saves every image XObject
    * found in the page resources as a JPEG file under {@code D:\pdf\}.
    *
    * <p>Each image is scaled to 90% with Thumbnailator before it is written. File names are
    * built from the current time in milliseconds plus a running counter, so images saved
    * within the same millisecond no longer overwrite each other.
    *
    * @param pdfPath path of the PDF file to parse
    * @throws Exception if the document cannot be loaded or read, or an image cannot be written
    */
   public static void pdfParse(String pdfPath) throws Exception {
       // try-with-resources closes the document on every exit path.
       try (PDDocument document = PDDocument.load(new File(pdfPath))) {

           // Document properties.
           PDDocumentInformation info = document.getDocumentInformation();
           System.out.println("標題:" + info.getTitle());
           System.out.println("主題:" + info.getSubject());
           System.out.println("作者:" + info.getAuthor());
           System.out.println("關鍵字:" + info.getKeywords());

           System.out.println("應用程序:" + info.getCreator());
           System.out.println("pdf 制作程序:" + info.getProducer());

           // This line used to be labelled "作者:" (author), copy-pasted from above,
           // although it prints the Trapped entry.
           System.out.println("陷印:" + info.getTrapped());

           System.out.println("創建時間:" + dateFormat(info.getCreationDate()));
           System.out.println("修改時間:" + dateFormat(info.getModificationDate()));

           // Full text of the document.
           PDFTextStripper pts = new PDFTextStripper();
           String content = pts.getText(document);
           System.out.println("內容:" + content);

           // Page resources: walk every page and export its image XObjects.
           int imageIndex = 1;
           int pageCount = document.getNumberOfPages();
           for (int i = 0; i < pageCount; i++) {
               PDPage page = document.getPage(i);
               // A page may have no resource dictionary at all; skip it instead of
               // failing with a NullPointerException.
               PDResources res = (page == null) ? null : page.getResources();
               if (res == null) {
                   continue;
               }

               for (COSName cosName : res.getXObjectNames()) {
                   System.out.println(cosName.getName());
                   // Only image XObjects are exported. Original author's note: this also
                   // picks up images that sit in page headers and footers.
                   if (!res.isImageXObject(cosName)) {
                       continue;
                   }
                   PDImageXObject pdImageXObject = (PDImageXObject) res.getXObject(cosName);

                   // The counter keeps names unique when several images are written
                   // within one millisecond.
                   File target = new File(
                           "D:\\pdf\\" + System.currentTimeMillis() + "_" + imageIndex + ".jpg");
                   imageIndex++;

                   // Saved through the Thumbnailator library (net.coobird); any other way
                   // of writing the BufferedImage to disk would work as well.
                   Thumbnails.of(pdImageXObject.getImage()).scale(0.9).toFile(target);
               }
           }
       }
   }

   /**
    * Renders every page of a PDF file to a JPEG image.
    *
    * <p>Images are written to {@code <base>/<pdfName>/<pdfName>_NNN.jpg}. {@code <base>} is
    * {@code dstImgFolder}, or the directory containing the PDF when {@code dstImgFolder} is
    * {@code null} or empty. {@code <pdfName>} is the PDF file name without its extension and
    * {@code NNN} is the 1-based page number as produced by {@code formatNumber}.
    *
    * @param PdfFilePath  full path of the PDF file
    * @param dstImgFolder base output folder; {@code null} or {@code ""} selects the PDF's own
    *                     directory
    * @param dpi          rendering resolution; a higher value gives a sharper image but a
    *                     slower conversion
    * @return {@code true} when all pages were written; {@code false} when the output folder
    *         could not be created or an {@link IOException} occurred (its stack trace is
    *         printed)
    */
   private static boolean pdf2Image(String PdfFilePath, String dstImgFolder, int dpi) {
       File file = new File(PdfFilePath);

       // Strip the extension. A name without a dot (or starting with its only dot) is used
       // as-is; substring(0, -1) would otherwise throw.
       String fileName = file.getName();
       int dot = fileName.lastIndexOf('.');
       String imagePDFName = (dot > 0) ? fileName.substring(0, dot) : fileName;

       // Resolve the base folder. The absolute parent is used so that a bare relative file
       // name does not produce a literal "null" path segment.
       String baseFolder;
       if (dstImgFolder == null || dstImgFolder.isEmpty()) {
           baseFolder = file.getAbsoluteFile().getParent();
       } else {
           baseFolder = dstImgFolder;
       }
       String imgFolderPath = baseFolder + File.separator + imagePDFName;

       if (!createDirectory(imgFolderPath)) {
           System.out.println("PDF文檔轉圖片失敗：" + "創建" + imgFolderPath + "失敗");
           return false;
       }

       // try-with-resources releases the document even when rendering or writing fails.
       try (PDDocument pdDocument = PDDocument.load(file)) {
           PDFRenderer renderer = new PDFRenderer(pdDocument);
           // The page count comes from the already loaded document; no second reader is
           // opened just to count pages.
           int pages = pdDocument.getNumberOfPages();
           String imgFilePathPrefix = imgFolderPath + File.separator + imagePDFName;

           for (int i = 0; i < pages; i++) {
               File dstFile = new File(imgFilePathPrefix + "_" + formatNumber(i + 1) + ".jpg");
               BufferedImage image = renderer.renderImageWithDPI(i, dpi);
               writeJpeg(image, dstFile, 0.3f);
           }

           // Report the folder the images actually went to (dstImgFolder may be empty).
           System.out.println("PDF文檔轉圖片成功！" + imgFolderPath);
           return true;
       } catch (IOException e) {
           e.printStackTrace();
           return false;
       }
   }

   /**
    * Encodes {@code image} as JPEG with the given compression quality and writes it to
    * {@code target}, replacing any existing file.
    *
    * @param image   the image to encode
    * @param target  the file to write
    * @param quality JPEG compression quality between 0 and 1
    * @throws IOException if no JPEG writer or output stream is available, or writing fails
    */
   private static void writeJpeg(BufferedImage image, File target, float quality) throws IOException {
       Iterator<ImageWriter> writers = ImageIO.getImageWritersByFormatName("jpg");
       if (!writers.hasNext()) {
           throw new IOException("No JPEG ImageWriter is available");
       }
       ImageWriter writer = writers.next();

       // An image output stream opened on an existing file does not truncate it, so remove
       // a stale file first to avoid leftover bytes after a shorter rewrite.
       target.delete();

       try (javax.imageio.stream.ImageOutputStream out = ImageIO.createImageOutputStream(target)) {
           if (out == null) {
               throw new IOException("Cannot open image output stream for " + target);
           }
           writer.setOutput(out);

           ImageWriteParam param = writer.getDefaultWriteParam();
           param.setCompressionMode(ImageWriteParam.MODE_EXPLICIT);
           param.setCompressionQuality(quality);
           writer.write(null, new IIOImage(image, null, null), param);
       } finally {
           // Release the writer's resources whether or not the write succeeded.
           writer.dispose();
       }
   }

   /**
    * Renders a number as text, prefixed with zeros for small values.
    *
    * <p>Values below 10 get the prefix {@code "00"}, values below 100 get {@code "0"}, and
    * larger values are returned without a prefix, so 0 to 999 are all three characters wide.
    *
    * @param i the number to format
    * @return the prefixed decimal text of {@code i}
    */
   private static String formatNumber(int i) {
       String zeroPrefix = (i < 10) ? "00" : (i < 100) ? "0" : "";
       return zeroPrefix + i;
   }

   /**
    * Ensures that {@code folder} exists as a directory, creating it and any missing parent
    * directories when necessary.
    *
    * @param folder path of the directory to ensure
    * @return {@code true} if the path is a directory when the method returns; {@code false}
    *         otherwise, including when the path already exists as a regular file
    */
   private static boolean createDirectory(String folder) {
       File dir = new File(folder);
       // isDirectory() rather than exists(): an existing regular file at this path is not a
       // usable output directory and must not be reported as success.
       if (dir.isDirectory()) {
           return true;
       }
       // mkdirs() returns false when the directory appeared in the meantime (for example,
       // created by another thread), so re-check before reporting failure.
       return dir.mkdirs() || dir.isDirectory();
   }

   /**
    * Formats the instant held by a calendar with the {@code DATE_FORMAT} pattern.
    *
    * <p>The formatter is created from the pattern alone, so it uses
    * {@link SimpleDateFormat}'s default locale and time zone.
    *
    * @param calendar the moment to format; may be {@code null}
    * @return the formatted timestamp, or {@code null} when {@code calendar} is {@code null}
    * @throws Exception if formatting fails
    */
   public static String dateFormat(Calendar calendar) throws Exception {
       if (calendar == null) {
           return null;
       }
       SimpleDateFormat formatter = new SimpleDateFormat(DATE_FORMAT);
       return formatter.format(calendar.getTime());
   }

   }

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯繫本站郵箱yoyou2525@163.com刪除。

猜您在找 Delphi提取PDF文本如何使用免費PDF控件從PDF文檔中提取文本和圖片用Spire.PDF提取PDF里的PNG圖片 Java 添加、提取PDF中的圖片【python】提取pdf文件中的所有圖片 python如何提取pdf文件圖片中的文字？ Python操作PDF-文本和圖片提取（使用PyPDF2和PyMuPDF） Python | 圖片轉pdf Java PDF轉圖片 pdfbox pdf轉圖片